{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 384.0, "learning_rate": 1.18e-05, "loss": 99.3112, "loss/crossentropy": 9.301286220550537, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.171189308166504, "step": 2 }, { "epoch": 0.00025, "grad_norm": 388.0, "learning_rate": 1.3600000000000002e-05, "loss": 98.5376, "loss/crossentropy": 9.283345699310303, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.322719573974609, "step": 4 }, { "epoch": 0.000375, "grad_norm": 380.0, "learning_rate": 1.54e-05, "loss": 98.4698, "loss/crossentropy": 9.26666784286499, "loss/hidden": 16.625, "loss/jsd": 0.0, "loss/logits": 7.094146490097046, "step": 6 }, { "epoch": 0.0005, "grad_norm": 187.0, "learning_rate": 1.72e-05, "loss": 95.8811, "loss/crossentropy": 9.060422420501709, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 6.9519524574279785, "step": 8 }, { "epoch": 0.000625, "grad_norm": 158.0, "learning_rate": 1.9e-05, "loss": 91.1537, "loss/crossentropy": 8.855913162231445, "loss/hidden": 16.375, "loss/jsd": 0.0, "loss/logits": 6.698125123977661, "step": 10 }, { "epoch": 0.00075, "grad_norm": 135.0, "learning_rate": 2.0800000000000004e-05, "loss": 89.0469, "loss/crossentropy": 8.479426860809326, "loss/hidden": 16.3125, "loss/jsd": 0.0, "loss/logits": 6.1546266078948975, "step": 12 }, { "epoch": 0.000875, "grad_norm": 119.0, "learning_rate": 2.2600000000000004e-05, "loss": 87.3701, "loss/crossentropy": 8.417439937591553, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.330978155136108, "step": 14 }, { "epoch": 0.001, "grad_norm": 98.0, "grad_norm_var": 15809.7625, "learning_rate": 2.4400000000000004e-05, "loss": 81.7839, "loss/crossentropy": 7.888103723526001, "loss/hidden": 15.8125, "loss/jsd": 0.0, "loss/logits": 5.809406042098999, "step": 16 }, { "epoch": 0.001125, "grad_norm": 278.0, "grad_norm_var": 12072.916666666666, "learning_rate": 2.6200000000000003e-05, "loss": 83.0321, "loss/crossentropy": 7.949460506439209, "loss/hidden": 15.34375, "loss/jsd": 0.0, "loss/logits": 6.00595760345459, "step": 18 }, { "epoch": 0.00125, "grad_norm": 67.5, "grad_norm_var": 8976.948958333332, "learning_rate": 2.8000000000000003e-05, "loss": 79.5947, "loss/crossentropy": 7.64544939994812, "loss/hidden": 15.25, "loss/jsd": 0.0, "loss/logits": 5.5388875007629395, "step": 20 }, { "epoch": 0.001375, "grad_norm": 38.5, "grad_norm_var": 4950.315625, "learning_rate": 2.9800000000000006e-05, "loss": 74.6424, "loss/crossentropy": 7.209100246429443, "loss/hidden": 15.15625, "loss/jsd": 0.0, "loss/logits": 5.076019763946533, "step": 22 }, { "epoch": 0.0015, "grad_norm": 54.5, "grad_norm_var": 4140.295833333334, "learning_rate": 3.16e-05, "loss": 71.7249, "loss/crossentropy": 7.1052405834198, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 5.032779216766357, "step": 24 }, { "epoch": 0.001625, "grad_norm": 90.5, "grad_norm_var": 3923.795833333333, "learning_rate": 3.3400000000000005e-05, "loss": 69.0909, "loss/crossentropy": 6.593599557876587, "loss/hidden": 14.9375, "loss/jsd": 0.0, "loss/logits": 4.861028671264648, "step": 26 }, { "epoch": 0.00175, "grad_norm": 49.0, "grad_norm_var": 4052.4239583333333, "learning_rate": 3.520000000000001e-05, "loss": 64.7694, "loss/crossentropy": 6.363184213638306, "loss/hidden": 14.59375, "loss/jsd": 0.0, "loss/logits": 4.430697441101074, "step": 28 }, { "epoch": 0.001875, "grad_norm": 47.0, "grad_norm_var": 4244.118489583333, "learning_rate": 3.7e-05, "loss": 59.3223, "loss/crossentropy": 5.989596843719482, "loss/hidden": 13.84375, "loss/jsd": 0.0, "loss/logits": 4.165619850158691, "step": 30 }, { "epoch": 0.002, "grad_norm": 71.0, "grad_norm_var": 4305.730989583333, "learning_rate": 3.88e-05, "loss": 55.1302, "loss/crossentropy": 5.726909637451172, "loss/hidden": 13.53125, "loss/jsd": 0.0, "loss/logits": 3.7759565114974976, "step": 32 }, { "epoch": 0.002125, "grad_norm": 60.0, "grad_norm_var": 934.0955729166667, "learning_rate": 4.0600000000000004e-05, "loss": 50.1945, "loss/crossentropy": 5.208499431610107, "loss/hidden": 13.125, "loss/jsd": 0.0, "loss/logits": 3.081121802330017, "step": 34 }, { "epoch": 0.00225, "grad_norm": 46.75, "grad_norm_var": 266.72083333333336, "learning_rate": 4.240000000000001e-05, "loss": 46.3994, "loss/crossentropy": 4.913021564483643, "loss/hidden": 12.375, "loss/jsd": 0.0, "loss/logits": 2.866178512573242, "step": 36 }, { "epoch": 0.002375, "grad_norm": 51.75, "grad_norm_var": 237.22395833333334, "learning_rate": 4.420000000000001e-05, "loss": 42.1907, "loss/crossentropy": 4.504716157913208, "loss/hidden": 12.0625, "loss/jsd": 0.0, "loss/logits": 2.6037776470184326, "step": 38 }, { "epoch": 0.0025, "grad_norm": 50.0, "grad_norm_var": 236.25390625, "learning_rate": 4.600000000000001e-05, "loss": 39.115, "loss/crossentropy": 4.373331546783447, "loss/hidden": 11.375, "loss/jsd": 0.0, "loss/logits": 2.2266069650650024, "step": 40 }, { "epoch": 0.002625, "grad_norm": 33.0, "grad_norm_var": 164.56640625, "learning_rate": 4.78e-05, "loss": 36.1801, "loss/crossentropy": 4.276909589767456, "loss/hidden": 11.0625, "loss/jsd": 0.0, "loss/logits": 2.2537089586257935, "step": 42 }, { "epoch": 0.00275, "grad_norm": 41.0, "grad_norm_var": 170.54765625, "learning_rate": 4.96e-05, "loss": 33.7672, "loss/crossentropy": 3.979385256767273, "loss/hidden": 10.59375, "loss/jsd": 0.0, "loss/logits": 1.776978850364685, "step": 44 }, { "epoch": 0.002875, "grad_norm": 31.5, "grad_norm_var": 205.69140625, "learning_rate": 5.14e-05, "loss": 31.4663, "loss/crossentropy": 3.5722849369049072, "loss/hidden": 10.15625, "loss/jsd": 0.0, "loss/logits": 1.7410615682601929, "step": 46 }, { "epoch": 0.003, "grad_norm": 21.375, "grad_norm_var": 211.38020833333334, "learning_rate": 5.3200000000000006e-05, "loss": 29.7082, "loss/crossentropy": 3.679291844367981, "loss/hidden": 9.625, "loss/jsd": 0.0, "loss/logits": 1.594287633895874, "step": 48 }, { "epoch": 0.003125, "grad_norm": 23.125, "grad_norm_var": 105.7416015625, "learning_rate": 5.500000000000001e-05, "loss": 28.489, "loss/crossentropy": 3.9182190895080566, "loss/hidden": 9.40625, "loss/jsd": 0.0, "loss/logits": 1.5025497078895569, "step": 50 }, { "epoch": 0.00325, "grad_norm": 29.875, "grad_norm_var": 105.0306640625, "learning_rate": 5.680000000000001e-05, "loss": 27.5703, "loss/crossentropy": 3.526407241821289, "loss/hidden": 9.25, "loss/jsd": 0.0, "loss/logits": 1.494104266166687, "step": 52 }, { "epoch": 0.003375, "grad_norm": 19.625, "grad_norm_var": 99.2416015625, "learning_rate": 5.860000000000001e-05, "loss": 26.1189, "loss/crossentropy": 3.4616609811782837, "loss/hidden": 9.0, "loss/jsd": 0.0, "loss/logits": 1.3545405268669128, "step": 54 }, { "epoch": 0.0035, "grad_norm": 22.5, "grad_norm_var": 54.81920572916667, "learning_rate": 6.040000000000001e-05, "loss": 24.328, "loss/crossentropy": 3.308198928833008, "loss/hidden": 8.75, "loss/jsd": 0.0, "loss/logits": 1.2083913683891296, "step": 56 }, { "epoch": 0.003625, "grad_norm": 14.0625, "grad_norm_var": 59.79152018229167, "learning_rate": 6.220000000000001e-05, "loss": 24.2188, "loss/crossentropy": 3.5452929735183716, "loss/hidden": 8.4375, "loss/jsd": 0.0, "loss/logits": 1.2204867601394653, "step": 58 }, { "epoch": 0.00375, "grad_norm": 15.75, "grad_norm_var": 52.173177083333336, "learning_rate": 6.400000000000001e-05, "loss": 22.8282, "loss/crossentropy": 3.1143264770507812, "loss/hidden": 8.40625, "loss/jsd": 0.0, "loss/logits": 1.1705525517463684, "step": 60 }, { "epoch": 0.003875, "grad_norm": 20.125, "grad_norm_var": 38.25930989583333, "learning_rate": 6.58e-05, "loss": 22.306, "loss/crossentropy": 3.136604428291321, "loss/hidden": 7.96875, "loss/jsd": 0.0, "loss/logits": 1.1404522061347961, "step": 62 }, { "epoch": 0.004, "grad_norm": 16.5, "grad_norm_var": 40.18274739583333, "learning_rate": 6.76e-05, "loss": 21.058, "loss/crossentropy": 2.9673322439193726, "loss/hidden": 7.703125, "loss/jsd": 0.0, "loss/logits": 1.0015667080879211, "step": 64 }, { "epoch": 0.004125, "grad_norm": 11.6875, "grad_norm_var": 39.946614583333336, "learning_rate": 6.94e-05, "loss": 21.0828, "loss/crossentropy": 3.2232860326766968, "loss/hidden": 7.546875, "loss/jsd": 0.0, "loss/logits": 0.964312881231308, "step": 66 }, { "epoch": 0.00425, "grad_norm": 15.4375, "grad_norm_var": 32.87902018229167, "learning_rate": 7.120000000000001e-05, "loss": 20.2688, "loss/crossentropy": 3.371062755584717, "loss/hidden": 7.53125, "loss/jsd": 0.0, "loss/logits": 0.971402496099472, "step": 68 }, { "epoch": 0.004375, "grad_norm": 11.8125, "grad_norm_var": 37.155322265625, "learning_rate": 7.3e-05, "loss": 19.6652, "loss/crossentropy": 2.8037211894989014, "loss/hidden": 7.40625, "loss/jsd": 0.0, "loss/logits": 0.952717661857605, "step": 70 }, { "epoch": 0.0045, "grad_norm": 116.5, "grad_norm_var": 640.5003743489583, "learning_rate": 7.48e-05, "loss": 19.6559, "loss/crossentropy": 2.9093810319900513, "loss/hidden": 7.15625, "loss/jsd": 0.0, "loss/logits": 0.9704654216766357, "step": 72 }, { "epoch": 0.004625, "grad_norm": 9.4375, "grad_norm_var": 651.0841145833333, "learning_rate": 7.66e-05, "loss": 18.7849, "loss/crossentropy": 2.824882984161377, "loss/hidden": 7.0625, "loss/jsd": 0.0, "loss/logits": 0.8673952519893646, "step": 74 }, { "epoch": 0.00475, "grad_norm": 21.875, "grad_norm_var": 649.4197916666667, "learning_rate": 7.840000000000001e-05, "loss": 18.5261, "loss/crossentropy": 2.8125277757644653, "loss/hidden": 7.109375, "loss/jsd": 0.0, "loss/logits": 0.8680737912654877, "step": 76 }, { "epoch": 0.004875, "grad_norm": 12.1875, "grad_norm_var": 658.5675618489583, "learning_rate": 8.020000000000001e-05, "loss": 18.4968, "loss/crossentropy": 2.8050509691238403, "loss/hidden": 6.828125, "loss/jsd": 0.0, "loss/logits": 0.8595540523529053, "step": 78 }, { "epoch": 0.005, "grad_norm": 11.5, "grad_norm_var": 669.7122233072917, "learning_rate": 8.200000000000001e-05, "loss": 18.0691, "loss/crossentropy": 3.2670862674713135, "loss/hidden": 6.8125, "loss/jsd": 0.0, "loss/logits": 0.8241342604160309, "step": 80 }, { "epoch": 0.005125, "grad_norm": 12.625, "grad_norm_var": 669.6054524739583, "learning_rate": 8.38e-05, "loss": 17.4693, "loss/crossentropy": 2.700217127799988, "loss/hidden": 6.765625, "loss/jsd": 0.0, "loss/logits": 0.8141748309135437, "step": 82 }, { "epoch": 0.00525, "grad_norm": 11.4375, "grad_norm_var": 679.333056640625, "learning_rate": 8.560000000000001e-05, "loss": 16.8553, "loss/crossentropy": 2.619894862174988, "loss/hidden": 6.578125, "loss/jsd": 0.0, "loss/logits": 0.755303144454956, "step": 84 }, { "epoch": 0.005375, "grad_norm": 10.8125, "grad_norm_var": 680.3473307291666, "learning_rate": 8.740000000000001e-05, "loss": 16.8983, "loss/crossentropy": 2.8718719482421875, "loss/hidden": 6.484375, "loss/jsd": 0.0, "loss/logits": 0.8335458338260651, "step": 86 }, { "epoch": 0.0055, "grad_norm": 13.8125, "grad_norm_var": 8.321858723958334, "learning_rate": 8.92e-05, "loss": 16.8672, "loss/crossentropy": 2.806625247001648, "loss/hidden": 6.546875, "loss/jsd": 0.0, "loss/logits": 0.7781052589416504, "step": 88 }, { "epoch": 0.005625, "grad_norm": 12.1875, "grad_norm_var": 7.507014973958333, "learning_rate": 9.1e-05, "loss": 16.4737, "loss/crossentropy": 3.016478180885315, "loss/hidden": 6.375, "loss/jsd": 0.0, "loss/logits": 0.7285971939563751, "step": 90 }, { "epoch": 0.00575, "grad_norm": 13.25, "grad_norm_var": 1.43046875, "learning_rate": 9.28e-05, "loss": 16.47, "loss/crossentropy": 2.5847216844558716, "loss/hidden": 6.359375, "loss/jsd": 0.0, "loss/logits": 0.6861400604248047, "step": 92 }, { "epoch": 0.005875, "grad_norm": 9.875, "grad_norm_var": 1.4640462239583334, "learning_rate": 9.46e-05, "loss": 16.3726, "loss/crossentropy": 2.6700236797332764, "loss/hidden": 6.328125, "loss/jsd": 0.0, "loss/logits": 0.7058612108230591, "step": 94 }, { "epoch": 0.006, "grad_norm": 11.6875, "grad_norm_var": 1.329541015625, "learning_rate": 9.64e-05, "loss": 16.0121, "loss/crossentropy": 2.8999104499816895, "loss/hidden": 6.203125, "loss/jsd": 0.0, "loss/logits": 0.7061053812503815, "step": 96 }, { "epoch": 0.006125, "grad_norm": 12.25, "grad_norm_var": 1.3148274739583334, "learning_rate": 9.82e-05, "loss": 15.9048, "loss/crossentropy": 2.9132989645004272, "loss/hidden": 6.234375, "loss/jsd": 0.0, "loss/logits": 0.7202947437763214, "step": 98 }, { "epoch": 0.00625, "grad_norm": 10.1875, "grad_norm_var": 1.2620930989583334, "learning_rate": 0.0001, "loss": 15.4134, "loss/crossentropy": 2.6530884504318237, "loss/hidden": 5.90625, "loss/jsd": 0.0, "loss/logits": 0.6581160724163055, "step": 100 }, { "epoch": 0.006375, "grad_norm": 12.8125, "grad_norm_var": 2.23736572265625, "learning_rate": 0.0001, "loss": 15.5444, "loss/crossentropy": 2.285265564918518, "loss/hidden": 6.15625, "loss/jsd": 0.0, "loss/logits": 0.6365102231502533, "step": 102 }, { "epoch": 0.0065, "grad_norm": 12.25, "grad_norm_var": 1.9623006184895833, "learning_rate": 0.0001, "loss": 15.3962, "loss/crossentropy": 2.9150387048721313, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.6743068099021912, "step": 104 }, { "epoch": 0.006625, "grad_norm": 11.375, "grad_norm_var": 1.9106730143229167, "learning_rate": 0.0001, "loss": 15.0494, "loss/crossentropy": 2.461984634399414, "loss/hidden": 5.875, "loss/jsd": 0.0, "loss/logits": 0.5759885013103485, "step": 106 }, { "epoch": 0.00675, "grad_norm": 9.4375, "grad_norm_var": 1.83931884765625, "learning_rate": 0.0001, "loss": 15.2, "loss/crossentropy": 2.545448660850525, "loss/hidden": 5.828125, "loss/jsd": 0.0, "loss/logits": 0.6016611158847809, "step": 108 }, { "epoch": 0.006875, "grad_norm": 9.5, "grad_norm_var": 2.117997233072917, "learning_rate": 0.0001, "loss": 14.7974, "loss/crossentropy": 2.70013689994812, "loss/hidden": 5.8125, "loss/jsd": 0.0, "loss/logits": 0.6241994798183441, "step": 110 }, { "epoch": 0.007, "grad_norm": 11.6875, "grad_norm_var": 2.129715983072917, "learning_rate": 0.0001, "loss": 14.9825, "loss/crossentropy": 2.7020071744918823, "loss/hidden": 5.765625, "loss/jsd": 0.0, "loss/logits": 0.6234863996505737, "step": 112 }, { "epoch": 0.007125, "grad_norm": 10.0, "grad_norm_var": 2.9886067708333335, "learning_rate": 0.0001, "loss": 14.717, "loss/crossentropy": 2.513030529022217, "loss/hidden": 5.765625, "loss/jsd": 0.0, "loss/logits": 0.6006259322166443, "step": 114 }, { "epoch": 0.00725, "grad_norm": 9.8125, "grad_norm_var": 2.908317057291667, "learning_rate": 0.0001, "loss": 14.5928, "loss/crossentropy": 2.696964979171753, "loss/hidden": 5.640625, "loss/jsd": 0.0, "loss/logits": 0.6187423169612885, "step": 116 }, { "epoch": 0.007375, "grad_norm": 8.0, "grad_norm_var": 2.4882771809895834, "learning_rate": 0.0001, "loss": 14.4255, "loss/crossentropy": 2.6013330221176147, "loss/hidden": 5.578125, "loss/jsd": 0.0, "loss/logits": 0.6197507381439209, "step": 118 }, { "epoch": 0.0075, "grad_norm": 9.5625, "grad_norm_var": 2.2302042643229165, "learning_rate": 0.0001, "loss": 14.3271, "loss/crossentropy": 2.411963939666748, "loss/hidden": 5.609375, "loss/jsd": 0.0, "loss/logits": 0.625188797712326, "step": 120 }, { "epoch": 0.007625, "grad_norm": 7.84375, "grad_norm_var": 2.220686848958333, "learning_rate": 0.0001, "loss": 14.2801, "loss/crossentropy": 2.6053736209869385, "loss/hidden": 5.6875, "loss/jsd": 0.0, "loss/logits": 0.6165933012962341, "step": 122 }, { "epoch": 0.00775, "grad_norm": 8.9375, "grad_norm_var": 1.343603515625, "learning_rate": 0.0001, "loss": 14.212, "loss/crossentropy": 2.693827986717224, "loss/hidden": 5.546875, "loss/jsd": 0.0, "loss/logits": 0.593802809715271, "step": 124 }, { "epoch": 0.007875, "grad_norm": 8.9375, "grad_norm_var": 1.392822265625, "learning_rate": 0.0001, "loss": 14.083, "loss/crossentropy": 2.649814248085022, "loss/hidden": 5.484375, "loss/jsd": 0.0, "loss/logits": 0.5663131475448608, "step": 126 }, { "epoch": 0.008, "grad_norm": 7.90625, "grad_norm_var": 0.8796183268229166, "learning_rate": 0.0001, "loss": 13.7585, "loss/crossentropy": 2.813218355178833, "loss/hidden": 5.640625, "loss/jsd": 0.0, "loss/logits": 0.619841456413269, "step": 128 }, { "epoch": 0.008125, "grad_norm": 9.4375, "grad_norm_var": 0.5834635416666667, "learning_rate": 0.0001, "loss": 13.7834, "loss/crossentropy": 2.496381640434265, "loss/hidden": 5.34375, "loss/jsd": 0.0, "loss/logits": 0.534807562828064, "step": 130 }, { "epoch": 0.00825, "grad_norm": 7.15625, "grad_norm_var": 0.73668212890625, "learning_rate": 0.0001, "loss": 13.8102, "loss/crossentropy": 2.587761878967285, "loss/hidden": 5.6875, "loss/jsd": 0.0, "loss/logits": 0.5979687869548798, "step": 132 }, { "epoch": 0.008375, "grad_norm": 11.0625, "grad_norm_var": 0.9780232747395833, "learning_rate": 0.0001, "loss": 13.7661, "loss/crossentropy": 3.0042346715927124, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.568140983581543, "step": 134 }, { "epoch": 0.0085, "grad_norm": 9.5625, "grad_norm_var": 0.8787394205729167, "learning_rate": 0.0001, "loss": 13.7338, "loss/crossentropy": 2.4913647174835205, "loss/hidden": 5.53125, "loss/jsd": 0.0, "loss/logits": 0.5167834609746933, "step": 136 }, { "epoch": 0.008625, "grad_norm": 7.4375, "grad_norm_var": 1.00718994140625, "learning_rate": 0.0001, "loss": 13.6016, "loss/crossentropy": 2.468238115310669, "loss/hidden": 5.28125, "loss/jsd": 0.0, "loss/logits": 0.5614461004734039, "step": 138 }, { "epoch": 0.00875, "grad_norm": 8.8125, "grad_norm_var": 1.0800740559895834, "learning_rate": 0.0001, "loss": 13.5316, "loss/crossentropy": 2.4444445371627808, "loss/hidden": 5.3125, "loss/jsd": 0.0, "loss/logits": 0.5187713205814362, "step": 140 }, { "epoch": 0.008875, "grad_norm": 6.96875, "grad_norm_var": 1.2684895833333334, "learning_rate": 0.0001, "loss": 13.0968, "loss/crossentropy": 2.6214382648468018, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5483916699886322, "step": 142 }, { "epoch": 0.009, "grad_norm": 8.375, "grad_norm_var": 1.2163045247395834, "learning_rate": 0.0001, "loss": 13.202, "loss/crossentropy": 2.7945733070373535, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5334698259830475, "step": 144 }, { "epoch": 0.009125, "grad_norm": 8.375, "grad_norm_var": 1.11070556640625, "learning_rate": 0.0001, "loss": 13.4962, "loss/crossentropy": 2.6263811588287354, "loss/hidden": 5.3125, "loss/jsd": 0.0, "loss/logits": 0.5304541736841202, "step": 146 }, { "epoch": 0.00925, "grad_norm": 6.46875, "grad_norm_var": 1.1537394205729166, "learning_rate": 0.0001, "loss": 13.0194, "loss/crossentropy": 2.446092367172241, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.4983871430158615, "step": 148 }, { "epoch": 0.009375, "grad_norm": 8.875, "grad_norm_var": 0.7020182291666667, "learning_rate": 0.0001, "loss": 13.4196, "loss/crossentropy": 2.954146981239319, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5594009757041931, "step": 150 }, { "epoch": 0.0095, "grad_norm": 7.9375, "grad_norm_var": 0.563916015625, "learning_rate": 0.0001, "loss": 13.1519, "loss/crossentropy": 2.7116650342941284, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5489330589771271, "step": 152 }, { "epoch": 0.009625, "grad_norm": 6.875, "grad_norm_var": 0.66314697265625, "learning_rate": 0.0001, "loss": 12.9977, "loss/crossentropy": 2.6282447576522827, "loss/hidden": 5.1875, "loss/jsd": 0.0, "loss/logits": 0.4889778196811676, "step": 154 }, { "epoch": 0.00975, "grad_norm": 6.5625, "grad_norm_var": 0.7279256184895834, "learning_rate": 0.0001, "loss": 12.9168, "loss/crossentropy": 2.5541906356811523, "loss/hidden": 5.140625, "loss/jsd": 0.0, "loss/logits": 0.5468989908695221, "step": 156 }, { "epoch": 0.009875, "grad_norm": 8.25, "grad_norm_var": 0.7774739583333333, "learning_rate": 0.0001, "loss": 12.8049, "loss/crossentropy": 2.3998714685440063, "loss/hidden": 5.171875, "loss/jsd": 0.0, "loss/logits": 0.4967530369758606, "step": 158 }, { "epoch": 0.01, "grad_norm": 8.125, "grad_norm_var": 0.6618448893229166, "learning_rate": 0.0001, "loss": 12.8303, "loss/crossentropy": 2.5461435317993164, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.5074218511581421, "step": 160 }, { "epoch": 0.010125, "grad_norm": 7.8125, "grad_norm_var": 0.6085245768229167, "learning_rate": 0.0001, "loss": 12.7334, "loss/crossentropy": 2.2296184301376343, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.4967309385538101, "step": 162 }, { "epoch": 0.01025, "grad_norm": 7.28125, "grad_norm_var": 0.48121337890625, "learning_rate": 0.0001, "loss": 12.8275, "loss/crossentropy": 2.348438858985901, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.47058284282684326, "step": 164 }, { "epoch": 0.010375, "grad_norm": 7.0, "grad_norm_var": 0.39407552083333336, "learning_rate": 0.0001, "loss": 12.6261, "loss/crossentropy": 2.4020251035690308, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.4758017808198929, "step": 166 }, { "epoch": 0.0105, "grad_norm": 5.65625, "grad_norm_var": 0.6719685872395833, "learning_rate": 0.0001, "loss": 12.7439, "loss/crossentropy": 2.4506293535232544, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.5312856733798981, "step": 168 }, { "epoch": 0.010625, "grad_norm": 7.40625, "grad_norm_var": 0.6060506184895833, "learning_rate": 0.0001, "loss": 12.582, "loss/crossentropy": 2.5331802368164062, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.5095243901014328, "step": 170 }, { "epoch": 0.01075, "grad_norm": 8.625, "grad_norm_var": 0.6595662434895834, "learning_rate": 0.0001, "loss": 12.4879, "loss/crossentropy": 2.65364670753479, "loss/hidden": 5.140625, "loss/jsd": 0.0, "loss/logits": 0.543939620256424, "step": 172 }, { "epoch": 0.010875, "grad_norm": 6.21875, "grad_norm_var": 0.6911417643229166, "learning_rate": 0.0001, "loss": 12.2931, "loss/crossentropy": 2.2634752988815308, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.45544371008872986, "step": 174 }, { "epoch": 0.011, "grad_norm": 7.21875, "grad_norm_var": 0.6577473958333333, "learning_rate": 0.0001, "loss": 12.3996, "loss/crossentropy": 2.3653087615966797, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4665989428758621, "step": 176 }, { "epoch": 0.011125, "grad_norm": 7.8125, "grad_norm_var": 0.6873006184895833, "learning_rate": 0.0001, "loss": 12.335, "loss/crossentropy": 2.274166226387024, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.48722338676452637, "step": 178 }, { "epoch": 0.01125, "grad_norm": 8.875, "grad_norm_var": 0.8094685872395834, "learning_rate": 0.0001, "loss": 12.4249, "loss/crossentropy": 2.464481830596924, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.5341024994850159, "step": 180 }, { "epoch": 0.011375, "grad_norm": 6.84375, "grad_norm_var": 0.8151692708333333, "learning_rate": 0.0001, "loss": 12.12, "loss/crossentropy": 2.5521204471588135, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.5266247987747192, "step": 182 }, { "epoch": 0.0115, "grad_norm": 6.375, "grad_norm_var": 0.6587076822916667, "learning_rate": 0.0001, "loss": 12.0581, "loss/crossentropy": 2.380069375038147, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4448339492082596, "step": 184 }, { "epoch": 0.011625, "grad_norm": 7.46875, "grad_norm_var": 0.6198527018229166, "learning_rate": 0.0001, "loss": 12.2557, "loss/crossentropy": 2.6351869106292725, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.51109179854393, "step": 186 }, { "epoch": 0.01175, "grad_norm": 6.625, "grad_norm_var": 0.461572265625, "learning_rate": 0.0001, "loss": 12.3102, "loss/crossentropy": 2.606539011001587, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4482808858156204, "step": 188 }, { "epoch": 0.011875, "grad_norm": 7.3125, "grad_norm_var": 0.40491129557291666, "learning_rate": 0.0001, "loss": 11.8983, "loss/crossentropy": 2.5177031755447388, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.4657522886991501, "step": 190 }, { "epoch": 0.012, "grad_norm": 5.40625, "grad_norm_var": 0.5950358072916667, "learning_rate": 0.0001, "loss": 11.8962, "loss/crossentropy": 2.5478276014328003, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4511236548423767, "step": 192 }, { "epoch": 0.012125, "grad_norm": 7.25, "grad_norm_var": 0.5598592122395833, "learning_rate": 0.0001, "loss": 11.9408, "loss/crossentropy": 2.08566415309906, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4315005987882614, "step": 194 }, { "epoch": 0.01225, "grad_norm": 7.03125, "grad_norm_var": 0.40950113932291665, "learning_rate": 0.0001, "loss": 12.0969, "loss/crossentropy": 2.634473204612732, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4682666063308716, "step": 196 }, { "epoch": 0.012375, "grad_norm": 7.25, "grad_norm_var": 0.4279296875, "learning_rate": 0.0001, "loss": 12.0544, "loss/crossentropy": 2.6797198057174683, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4552183598279953, "step": 198 }, { "epoch": 0.0125, "grad_norm": 6.75, "grad_norm_var": 0.4488118489583333, "learning_rate": 0.0001, "loss": 11.9949, "loss/crossentropy": 2.9568880796432495, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.5152240097522736, "step": 200 }, { "epoch": 0.012625, "grad_norm": 6.1875, "grad_norm_var": 0.4198527018229167, "learning_rate": 0.0001, "loss": 11.876, "loss/crossentropy": 2.4664944410324097, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.4684390127658844, "step": 202 }, { "epoch": 0.01275, "grad_norm": 6.5625, "grad_norm_var": 0.41952718098958336, "learning_rate": 0.0001, "loss": 11.9644, "loss/crossentropy": 2.580668091773987, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.4738956689834595, "step": 204 }, { "epoch": 0.012875, "grad_norm": 6.3125, "grad_norm_var": 0.39635009765625, "learning_rate": 0.0001, "loss": 12.0377, "loss/crossentropy": 2.367862343788147, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.45996397733688354, "step": 206 }, { "epoch": 0.013, "grad_norm": 6.09375, "grad_norm_var": 0.27681884765625, "learning_rate": 0.0001, "loss": 11.9037, "loss/crossentropy": 2.5246529579162598, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.46563032269477844, "step": 208 }, { "epoch": 0.013125, "grad_norm": 6.46875, "grad_norm_var": 0.24596354166666667, "learning_rate": 0.0001, "loss": 11.7128, "loss/crossentropy": 2.20585036277771, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.41547301411628723, "step": 210 }, { "epoch": 0.01325, "grad_norm": 5.9375, "grad_norm_var": 0.2263671875, "learning_rate": 0.0001, "loss": 11.7218, "loss/crossentropy": 2.3064881563186646, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4325401932001114, "step": 212 }, { "epoch": 0.013375, "grad_norm": 7.03125, "grad_norm_var": 0.16054280598958334, "learning_rate": 0.0001, "loss": 11.5407, "loss/crossentropy": 2.3898541927337646, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.43332116305828094, "step": 214 }, { "epoch": 0.0135, "grad_norm": 6.5625, "grad_norm_var": 0.150244140625, "learning_rate": 0.0001, "loss": 11.8046, "loss/crossentropy": 2.456748604774475, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4493984282016754, "step": 216 }, { "epoch": 0.013625, "grad_norm": 6.25, "grad_norm_var": 0.18606770833333333, "learning_rate": 0.0001, "loss": 11.8232, "loss/crossentropy": 2.7504690885543823, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.44959259033203125, "step": 218 }, { "epoch": 0.01375, "grad_norm": 6.65625, "grad_norm_var": 0.18313802083333333, "learning_rate": 0.0001, "loss": 11.7742, "loss/crossentropy": 2.272356390953064, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4380947947502136, "step": 220 }, { "epoch": 0.013875, "grad_norm": 5.71875, "grad_norm_var": 0.20331624348958333, "learning_rate": 0.0001, "loss": 11.7707, "loss/crossentropy": 2.539223790168762, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.41349247097969055, "step": 222 }, { "epoch": 0.014, "grad_norm": 6.3125, "grad_norm_var": 0.21122639973958332, "learning_rate": 0.0001, "loss": 11.6392, "loss/crossentropy": 2.6272025108337402, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4406648874282837, "step": 224 }, { "epoch": 0.014125, "grad_norm": 5.65625, "grad_norm_var": 0.23255208333333333, "learning_rate": 0.0001, "loss": 11.3455, "loss/crossentropy": 2.1202113032341003, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.42472922801971436, "step": 226 }, { "epoch": 0.01425, "grad_norm": 6.5625, "grad_norm_var": 0.23212483723958333, "learning_rate": 0.0001, "loss": 11.5275, "loss/crossentropy": 2.420728087425232, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4595801681280136, "step": 228 }, { "epoch": 0.014375, "grad_norm": 5.65625, "grad_norm_var": 0.22057291666666667, "learning_rate": 0.0001, "loss": 11.849, "loss/crossentropy": 2.58668851852417, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.44095560908317566, "step": 230 }, { "epoch": 0.0145, "grad_norm": 5.28125, "grad_norm_var": 0.210546875, "learning_rate": 0.0001, "loss": 11.4306, "loss/crossentropy": 2.3560508489608765, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.4567548930644989, "step": 232 }, { "epoch": 0.014625, "grad_norm": 5.78125, "grad_norm_var": 0.19872639973958334, "learning_rate": 0.0001, "loss": 11.41, "loss/crossentropy": 2.362083673477173, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.4453700929880142, "step": 234 }, { "epoch": 0.01475, "grad_norm": 6.625, "grad_norm_var": 0.18801676432291667, "learning_rate": 0.0001, "loss": 11.3155, "loss/crossentropy": 2.6302807331085205, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.45155879855155945, "step": 236 }, { "epoch": 0.014875, "grad_norm": 6.28125, "grad_norm_var": 0.16236979166666668, "learning_rate": 0.0001, "loss": 11.5746, "loss/crossentropy": 2.504029393196106, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.423601895570755, "step": 238 }, { "epoch": 0.015, "grad_norm": 5.84375, "grad_norm_var": 0.14752604166666666, "learning_rate": 0.0001, "loss": 11.3137, "loss/crossentropy": 2.496834635734558, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.44761495292186737, "step": 240 }, { "epoch": 0.015125, "grad_norm": 5.75, "grad_norm_var": 0.14377848307291666, "learning_rate": 0.0001, "loss": 11.215, "loss/crossentropy": 2.5440114736557007, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.4467613846063614, "step": 242 }, { "epoch": 0.01525, "grad_norm": 6.8125, "grad_norm_var": 0.16174723307291666, "learning_rate": 0.0001, "loss": 11.4828, "loss/crossentropy": 2.3933448791503906, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4251607805490494, "step": 244 }, { "epoch": 0.015375, "grad_norm": 5.28125, "grad_norm_var": 0.19026285807291668, "learning_rate": 0.0001, "loss": 11.2663, "loss/crossentropy": 2.709121346473694, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.4511110782623291, "step": 246 }, { "epoch": 0.0155, "grad_norm": 6.375, "grad_norm_var": 0.18007405598958334, "learning_rate": 0.0001, "loss": 11.2542, "loss/crossentropy": 2.65135395526886, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.42732760310173035, "step": 248 }, { "epoch": 0.015625, "grad_norm": 5.5, "grad_norm_var": 0.25390218098958334, "learning_rate": 0.0001, "loss": 11.0863, "loss/crossentropy": 2.531478524208069, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.39773157238960266, "step": 250 }, { "epoch": 0.01575, "grad_norm": 5.75, "grad_norm_var": 0.23079427083333334, "learning_rate": 0.0001, "loss": 11.3519, "loss/crossentropy": 2.1759145259857178, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.40788644552230835, "step": 252 }, { "epoch": 0.015875, "grad_norm": 6.28125, "grad_norm_var": 0.246337890625, "learning_rate": 0.0001, "loss": 10.9975, "loss/crossentropy": 2.486463189125061, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4049537926912308, "step": 254 }, { "epoch": 0.016, "grad_norm": 4.90625, "grad_norm_var": 0.32945556640625, "learning_rate": 0.0001, "loss": 10.9813, "loss/crossentropy": 2.3456650972366333, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.3914157599210739, "step": 256 }, { "epoch": 0.016125, "grad_norm": 5.8125, "grad_norm_var": 0.35302327473958334, "learning_rate": 0.0001, "loss": 11.124, "loss/crossentropy": 2.7621407508850098, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4217756986618042, "step": 258 }, { "epoch": 0.01625, "grad_norm": 5.09375, "grad_norm_var": 0.30201416015625, "learning_rate": 0.0001, "loss": 11.0913, "loss/crossentropy": 2.520516872406006, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.42682692408561707, "step": 260 }, { "epoch": 0.016375, "grad_norm": 5.875, "grad_norm_var": 0.289697265625, "learning_rate": 0.0001, "loss": 11.0406, "loss/crossentropy": 2.668264865875244, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.44012486934661865, "step": 262 }, { "epoch": 0.0165, "grad_norm": 5.71875, "grad_norm_var": 0.221337890625, "learning_rate": 0.0001, "loss": 11.1313, "loss/crossentropy": 2.6228344440460205, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.47360049188137054, "step": 264 }, { "epoch": 0.016625, "grad_norm": 5.1875, "grad_norm_var": 0.2591796875, "learning_rate": 0.0001, "loss": 10.8727, "loss/crossentropy": 2.07044917345047, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3736244738101959, "step": 266 }, { "epoch": 0.01675, "grad_norm": 5.59375, "grad_norm_var": 0.24763997395833334, "learning_rate": 0.0001, "loss": 10.9825, "loss/crossentropy": 2.195676624774933, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.41074641048908234, "step": 268 }, { "epoch": 0.016875, "grad_norm": 5.15625, "grad_norm_var": 0.16717122395833334, "learning_rate": 0.0001, "loss": 10.9858, "loss/crossentropy": 2.6045761108398438, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.43537537753582, "step": 270 }, { "epoch": 0.017, "grad_norm": 5.40625, "grad_norm_var": 0.17616780598958334, "learning_rate": 0.0001, "loss": 11.1417, "loss/crossentropy": 2.343075156211853, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.3932172954082489, "step": 272 }, { "epoch": 0.017125, "grad_norm": 5.96875, "grad_norm_var": 0.18162434895833332, "learning_rate": 0.0001, "loss": 10.9988, "loss/crossentropy": 2.4649263620376587, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3909170925617218, "step": 274 }, { "epoch": 0.01725, "grad_norm": 4.5, "grad_norm_var": 0.23905843098958332, "learning_rate": 0.0001, "loss": 10.9516, "loss/crossentropy": 2.3632274866104126, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.39788326621055603, "step": 276 }, { "epoch": 0.017375, "grad_norm": 5.78125, "grad_norm_var": 0.24231363932291666, "learning_rate": 0.0001, "loss": 10.9188, "loss/crossentropy": 2.5861356258392334, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.41938433051109314, "step": 278 }, { "epoch": 0.0175, "grad_norm": 5.21875, "grad_norm_var": 0.23834635416666666, "learning_rate": 0.0001, "loss": 10.92, "loss/crossentropy": 2.5754419565200806, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4088515043258667, "step": 280 }, { "epoch": 0.017625, "grad_norm": 5.53125, "grad_norm_var": 0.21184895833333334, "learning_rate": 0.0001, "loss": 10.7969, "loss/crossentropy": 2.241589307785034, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.38494938611984253, "step": 282 }, { "epoch": 0.01775, "grad_norm": 5.40625, "grad_norm_var": 0.3136678059895833, "learning_rate": 0.0001, "loss": 10.9124, "loss/crossentropy": 2.332160472869873, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4286513030529022, "step": 284 }, { "epoch": 0.017875, "grad_norm": 5.84375, "grad_norm_var": 0.2992472330729167, "learning_rate": 0.0001, "loss": 10.8958, "loss/crossentropy": 2.452752709388733, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.37612101435661316, "step": 286 }, { "epoch": 0.018, "grad_norm": 4.84375, "grad_norm_var": 0.249072265625, "learning_rate": 0.0001, "loss": 10.7195, "loss/crossentropy": 2.2290940284729004, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.37554706633090973, "step": 288 }, { "epoch": 0.018125, "grad_norm": 5.875, "grad_norm_var": 0.24680989583333332, "learning_rate": 0.0001, "loss": 10.7342, "loss/crossentropy": 2.4139484167099, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.40335869789123535, "step": 290 }, { "epoch": 0.01825, "grad_norm": 4.78125, "grad_norm_var": 0.21402587890625, "learning_rate": 0.0001, "loss": 10.6719, "loss/crossentropy": 2.56483793258667, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.41296976804733276, "step": 292 }, { "epoch": 0.018375, "grad_norm": 6.375, "grad_norm_var": 0.24823811848958333, "learning_rate": 0.0001, "loss": 10.9241, "loss/crossentropy": 2.3277297019958496, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.3984246253967285, "step": 294 }, { "epoch": 0.0185, "grad_norm": 4.71875, "grad_norm_var": 0.28815104166666666, "learning_rate": 0.0001, "loss": 10.7746, "loss/crossentropy": 2.3450149297714233, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.3996598720550537, "step": 296 }, { "epoch": 0.018625, "grad_norm": 5.03125, "grad_norm_var": 0.352734375, "learning_rate": 0.0001, "loss": 10.5621, "loss/crossentropy": 2.317037582397461, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.40160971879959106, "step": 298 }, { "epoch": 0.01875, "grad_norm": 5.3125, "grad_norm_var": 0.23987223307291666, "learning_rate": 0.0001, "loss": 10.9369, "loss/crossentropy": 2.5068975687026978, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.410100519657135, "step": 300 }, { "epoch": 0.018875, "grad_norm": 4.9375, "grad_norm_var": 0.22589518229166666, "learning_rate": 0.0001, "loss": 10.7482, "loss/crossentropy": 2.351959705352783, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.39118409156799316, "step": 302 }, { "epoch": 0.019, "grad_norm": 5.59375, "grad_norm_var": 0.21608072916666668, "learning_rate": 0.0001, "loss": 10.7665, "loss/crossentropy": 2.3877638578414917, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.39836424589157104, "step": 304 }, { "epoch": 0.019125, "grad_norm": 5.53125, "grad_norm_var": 0.19894205729166667, "learning_rate": 0.0001, "loss": 10.7703, "loss/crossentropy": 2.600021004676819, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.41590385138988495, "step": 306 }, { "epoch": 0.01925, "grad_norm": 5.40625, "grad_norm_var": 0.18485921223958332, "learning_rate": 0.0001, "loss": 10.6674, "loss/crossentropy": 2.4758663177490234, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.39325758814811707, "step": 308 }, { "epoch": 0.019375, "grad_norm": 5.25, "grad_norm_var": 0.10755208333333334, "learning_rate": 0.0001, "loss": 10.5967, "loss/crossentropy": 2.3135849237442017, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.3709346354007721, "step": 310 }, { "epoch": 0.0195, "grad_norm": 4.34375, "grad_norm_var": 0.20885416666666667, "learning_rate": 0.0001, "loss": 10.7752, "loss/crossentropy": 2.4139580726623535, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4005644619464874, "step": 312 }, { "epoch": 0.019625, "grad_norm": 6.03125, "grad_norm_var": 0.21534830729166668, "learning_rate": 0.0001, "loss": 10.779, "loss/crossentropy": 2.5271564722061157, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.37980175018310547, "step": 314 }, { "epoch": 0.01975, "grad_norm": 4.53125, "grad_norm_var": 0.25982666015625, "learning_rate": 0.0001, "loss": 10.5117, "loss/crossentropy": 2.5739694833755493, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40202172100543976, "step": 316 }, { "epoch": 0.019875, "grad_norm": 6.625, "grad_norm_var": 0.36412353515625, "learning_rate": 0.0001, "loss": 10.5532, "loss/crossentropy": 2.245994448661804, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.36041176319122314, "step": 318 }, { "epoch": 0.02, "grad_norm": 4.65625, "grad_norm_var": 0.392041015625, "learning_rate": 0.0001, "loss": 10.5765, "loss/crossentropy": 2.5354862213134766, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3732207715511322, "step": 320 }, { "epoch": 0.020125, "grad_norm": 5.8125, "grad_norm_var": 0.463134765625, "learning_rate": 0.0001, "loss": 10.4762, "loss/crossentropy": 2.3753507137298584, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3840855211019516, "step": 322 }, { "epoch": 0.02025, "grad_norm": 4.90625, "grad_norm_var": 0.4641927083333333, "learning_rate": 0.0001, "loss": 10.2923, "loss/crossentropy": 2.4433764219284058, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.38050127029418945, "step": 324 }, { "epoch": 0.020375, "grad_norm": 5.21875, "grad_norm_var": 0.5068644205729167, "learning_rate": 0.0001, "loss": 10.6858, "loss/crossentropy": 2.484220504760742, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4011761546134949, "step": 326 }, { "epoch": 0.0205, "grad_norm": 5.3125, "grad_norm_var": 0.3748697916666667, "learning_rate": 0.0001, "loss": 10.7329, "loss/crossentropy": 2.6139092445373535, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4130321443080902, "step": 328 }, { "epoch": 0.020625, "grad_norm": 5.0, "grad_norm_var": 0.31311442057291666, "learning_rate": 0.0001, "loss": 10.606, "loss/crossentropy": 2.4625054597854614, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.3967062383890152, "step": 330 }, { "epoch": 0.02075, "grad_norm": 5.0625, "grad_norm_var": 0.2874837239583333, "learning_rate": 0.0001, "loss": 10.5546, "loss/crossentropy": 2.3454889059066772, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.3706536889076233, "step": 332 }, { "epoch": 0.020875, "grad_norm": 5.4375, "grad_norm_var": 0.18229166666666666, "learning_rate": 0.0001, "loss": 10.5487, "loss/crossentropy": 2.4348955154418945, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.37259407341480255, "step": 334 }, { "epoch": 0.021, "grad_norm": 4.25, "grad_norm_var": 0.21061197916666666, "learning_rate": 0.0001, "loss": 10.3802, "loss/crossentropy": 2.3722680807113647, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.37501636147499084, "step": 336 }, { "epoch": 0.021125, "grad_norm": 5.6875, "grad_norm_var": 0.17509358723958332, "learning_rate": 0.0001, "loss": 10.6118, "loss/crossentropy": 2.367267608642578, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.37614843249320984, "step": 338 }, { "epoch": 0.02125, "grad_norm": 4.96875, "grad_norm_var": 0.18136393229166667, "learning_rate": 0.0001, "loss": 10.409, "loss/crossentropy": 2.5342684984207153, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4095509201288223, "step": 340 }, { "epoch": 0.021375, "grad_norm": 4.25, "grad_norm_var": 0.200634765625, "learning_rate": 0.0001, "loss": 10.347, "loss/crossentropy": 2.366121530532837, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.38898931443691254, "step": 342 }, { "epoch": 0.0215, "grad_norm": 5.96875, "grad_norm_var": 0.391650390625, "learning_rate": 0.0001, "loss": 10.4284, "loss/crossentropy": 2.435874819755554, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4087076783180237, "step": 344 }, { "epoch": 0.021625, "grad_norm": 5.46875, "grad_norm_var": 0.39010416666666664, "learning_rate": 0.0001, "loss": 10.4112, "loss/crossentropy": 2.4133975505828857, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3934263288974762, "step": 346 }, { "epoch": 0.02175, "grad_norm": 4.8125, "grad_norm_var": 0.40168863932291665, "learning_rate": 0.0001, "loss": 10.59, "loss/crossentropy": 2.3464980125427246, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.36868566274642944, "step": 348 }, { "epoch": 0.021875, "grad_norm": 5.15625, "grad_norm_var": 0.3732421875, "learning_rate": 0.0001, "loss": 10.5281, "loss/crossentropy": 2.492926597595215, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.37226299941539764, "step": 350 }, { "epoch": 0.022, "grad_norm": 4.25, "grad_norm_var": 0.37948811848958336, "learning_rate": 0.0001, "loss": 10.2246, "loss/crossentropy": 2.3636070489883423, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3681965172290802, "step": 352 }, { "epoch": 0.022125, "grad_norm": 4.625, "grad_norm_var": 0.3692708333333333, "learning_rate": 0.0001, "loss": 10.5053, "loss/crossentropy": 2.288292169570923, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.3958088457584381, "step": 354 }, { "epoch": 0.02225, "grad_norm": 5.15625, "grad_norm_var": 0.38084309895833335, "learning_rate": 0.0001, "loss": 10.3737, "loss/crossentropy": 2.451295018196106, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.39023011922836304, "step": 356 }, { "epoch": 0.022375, "grad_norm": 4.59375, "grad_norm_var": 0.3459269205729167, "learning_rate": 0.0001, "loss": 10.3773, "loss/crossentropy": 2.3242127895355225, "loss/hidden": 4.0546875, "loss/jsd": 0.0, "loss/logits": 0.39542824029922485, "step": 358 }, { "epoch": 0.0225, "grad_norm": 5.03125, "grad_norm_var": 0.132666015625, "learning_rate": 0.0001, "loss": 10.531, "loss/crossentropy": 2.5108137130737305, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3721470236778259, "step": 360 }, { "epoch": 0.022625, "grad_norm": 4.8125, "grad_norm_var": 0.101416015625, "learning_rate": 0.0001, "loss": 10.4355, "loss/crossentropy": 2.282769203186035, "loss/hidden": 4.0859375, "loss/jsd": 0.0, "loss/logits": 0.353266179561615, "step": 362 }, { "epoch": 0.02275, "grad_norm": 5.40625, "grad_norm_var": 0.13072916666666667, "learning_rate": 0.0001, "loss": 10.3682, "loss/crossentropy": 2.3975025415420532, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3848300874233246, "step": 364 }, { "epoch": 0.022875, "grad_norm": 4.59375, "grad_norm_var": 0.13717447916666667, "learning_rate": 0.0001, "loss": 10.3197, "loss/crossentropy": 2.5082876682281494, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.38546115159988403, "step": 366 }, { "epoch": 0.023, "grad_norm": 5.75, "grad_norm_var": 0.23873697916666667, "learning_rate": 0.0001, "loss": 10.0812, "loss/crossentropy": 2.3333388566970825, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.35471296310424805, "step": 368 }, { "epoch": 0.023125, "grad_norm": 7.78125, "grad_norm_var": 0.6998697916666666, "learning_rate": 0.0001, "loss": 10.6427, "loss/crossentropy": 2.568707585334778, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3596802055835724, "step": 370 }, { "epoch": 0.02325, "grad_norm": 5.3125, "grad_norm_var": 0.73492431640625, "learning_rate": 0.0001, "loss": 10.3651, "loss/crossentropy": 2.393819808959961, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3487332612276077, "step": 372 }, { "epoch": 0.023375, "grad_norm": 5.03125, "grad_norm_var": 0.7124348958333333, "learning_rate": 0.0001, "loss": 10.2767, "loss/crossentropy": 2.472244381904602, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.40489913523197174, "step": 374 }, { "epoch": 0.0235, "grad_norm": 4.78125, "grad_norm_var": 0.7247029622395833, "learning_rate": 0.0001, "loss": 10.1461, "loss/crossentropy": 2.2458752393722534, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3645085096359253, "step": 376 }, { "epoch": 0.023625, "grad_norm": 4.40625, "grad_norm_var": 0.77890625, "learning_rate": 0.0001, "loss": 10.2872, "loss/crossentropy": 2.1981548070907593, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.38387705385684967, "step": 378 }, { "epoch": 0.02375, "grad_norm": 4.84375, "grad_norm_var": 0.7771443684895833, "learning_rate": 0.0001, "loss": 10.1281, "loss/crossentropy": 2.4362692832946777, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3760421574115753, "step": 380 }, { "epoch": 0.023875, "grad_norm": 4.65625, "grad_norm_var": 0.7679972330729167, "learning_rate": 0.0001, "loss": 10.2042, "loss/crossentropy": 2.4601200819015503, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3811968266963959, "step": 382 }, { "epoch": 0.024, "grad_norm": 4.875, "grad_norm_var": 0.684228515625, "learning_rate": 0.0001, "loss": 10.2408, "loss/crossentropy": 2.6233471632003784, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.36839838325977325, "step": 384 }, { "epoch": 0.024125, "grad_norm": 4.6875, "grad_norm_var": 0.165478515625, "learning_rate": 0.0001, "loss": 10.1497, "loss/crossentropy": 2.522361993789673, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3934475779533386, "step": 386 }, { "epoch": 0.02425, "grad_norm": 4.5, "grad_norm_var": 0.14563395182291666, "learning_rate": 0.0001, "loss": 10.1857, "loss/crossentropy": 2.573809266090393, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.375105544924736, "step": 388 }, { "epoch": 0.024375, "grad_norm": 4.40625, "grad_norm_var": 0.13203125, "learning_rate": 0.0001, "loss": 10.0545, "loss/crossentropy": 2.458760142326355, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.348113551735878, "step": 390 }, { "epoch": 0.0245, "grad_norm": 4.8125, "grad_norm_var": 0.13248291015625, "learning_rate": 0.0001, "loss": 10.1765, "loss/crossentropy": 2.8183611631393433, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.39342811703681946, "step": 392 }, { "epoch": 0.024625, "grad_norm": 4.65625, "grad_norm_var": 0.11769205729166667, "learning_rate": 0.0001, "loss": 10.0009, "loss/crossentropy": 2.2332464456558228, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3468857705593109, "step": 394 }, { "epoch": 0.02475, "grad_norm": 4.6875, "grad_norm_var": 0.14143473307291668, "learning_rate": 0.0001, "loss": 9.9097, "loss/crossentropy": 2.3098992109298706, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3603272885084152, "step": 396 }, { "epoch": 0.024875, "grad_norm": 4.46875, "grad_norm_var": 0.15129801432291667, "learning_rate": 0.0001, "loss": 10.1234, "loss/crossentropy": 2.1571128964424133, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3672170788049698, "step": 398 }, { "epoch": 0.025, "grad_norm": 4.8125, "grad_norm_var": 0.11464436848958333, "learning_rate": 0.0001, "loss": 10.1366, "loss/crossentropy": 2.3940361738204956, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3588118702173233, "step": 400 }, { "epoch": 0.025125, "grad_norm": 4.3125, "grad_norm_var": 0.1046875, "learning_rate": 0.0001, "loss": 10.248, "loss/crossentropy": 2.4594138860702515, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3485400527715683, "step": 402 }, { "epoch": 0.02525, "grad_norm": 4.9375, "grad_norm_var": 0.10377197265625, "learning_rate": 0.0001, "loss": 10.0681, "loss/crossentropy": 2.524109125137329, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3557968735694885, "step": 404 }, { "epoch": 0.025375, "grad_norm": 5.03125, "grad_norm_var": 0.10859375, "learning_rate": 0.0001, "loss": 10.0777, "loss/crossentropy": 2.3012895584106445, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.34439629316329956, "step": 406 }, { "epoch": 0.0255, "grad_norm": 4.28125, "grad_norm_var": 0.11404622395833333, "learning_rate": 0.0001, "loss": 9.8043, "loss/crossentropy": 2.4211593866348267, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3732890635728836, "step": 408 }, { "epoch": 0.025625, "grad_norm": 4.40625, "grad_norm_var": 0.11438395182291666, "learning_rate": 0.0001, "loss": 9.9315, "loss/crossentropy": 2.582412362098694, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.36784547567367554, "step": 410 }, { "epoch": 0.02575, "grad_norm": 4.0625, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 9.9946, "loss/crossentropy": 2.525751233100891, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3565828502178192, "step": 412 }, { "epoch": 0.025875, "grad_norm": 4.28125, "grad_norm_var": 0.092578125, "learning_rate": 0.0001, "loss": 10.2235, "loss/crossentropy": 2.670364737510681, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.4017476439476013, "step": 414 }, { "epoch": 0.026, "grad_norm": 4.4375, "grad_norm_var": 0.08396809895833333, "learning_rate": 0.0001, "loss": 9.8333, "loss/crossentropy": 2.3298041820526123, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3325035125017166, "step": 416 }, { "epoch": 0.026125, "grad_norm": 4.34375, "grad_norm_var": 0.08318684895833334, "learning_rate": 0.0001, "loss": 10.0788, "loss/crossentropy": 2.240808844566345, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.33864179253578186, "step": 418 }, { "epoch": 0.02625, "grad_norm": 4.4375, "grad_norm_var": 0.06643473307291667, "learning_rate": 0.0001, "loss": 9.933, "loss/crossentropy": 2.397716999053955, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3628341108560562, "step": 420 }, { "epoch": 0.026375, "grad_norm": 4.375, "grad_norm_var": 0.06901041666666667, "learning_rate": 0.0001, "loss": 10.0998, "loss/crossentropy": 2.4601176977157593, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3522925227880478, "step": 422 }, { "epoch": 0.0265, "grad_norm": 5.5, "grad_norm_var": 0.15636393229166667, "learning_rate": 0.0001, "loss": 10.1351, "loss/crossentropy": 2.4023250341415405, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.37351013720035553, "step": 424 }, { "epoch": 0.026625, "grad_norm": 4.71875, "grad_norm_var": 0.16990559895833332, "learning_rate": 0.0001, "loss": 10.0776, "loss/crossentropy": 2.271855592727661, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3706711381673813, "step": 426 }, { "epoch": 0.02675, "grad_norm": 3.953125, "grad_norm_var": 0.17908426920572917, "learning_rate": 0.0001, "loss": 9.7587, "loss/crossentropy": 2.0610432028770447, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3243980407714844, "step": 428 }, { "epoch": 0.026875, "grad_norm": 5.21875, "grad_norm_var": 0.20093485514322917, "learning_rate": 0.0001, "loss": 9.8788, "loss/crossentropy": 2.4309468269348145, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.4062986671924591, "step": 430 }, { "epoch": 0.027, "grad_norm": 4.65625, "grad_norm_var": 0.20027567545572916, "learning_rate": 0.0001, "loss": 10.0082, "loss/crossentropy": 2.4598418474197388, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.4272041916847229, "step": 432 }, { "epoch": 0.027125, "grad_norm": 5.03125, "grad_norm_var": 0.21314188639322917, "learning_rate": 0.0001, "loss": 9.9003, "loss/crossentropy": 2.539934992790222, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.35184885561466217, "step": 434 }, { "epoch": 0.02725, "grad_norm": 4.625, "grad_norm_var": 0.20462137858072918, "learning_rate": 0.0001, "loss": 9.9416, "loss/crossentropy": 2.269451856613159, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.35347896814346313, "step": 436 }, { "epoch": 0.027375, "grad_norm": 4.34375, "grad_norm_var": 0.1935455322265625, "learning_rate": 0.0001, "loss": 9.9578, "loss/crossentropy": 2.281239867210388, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3409070521593094, "step": 438 }, { "epoch": 0.0275, "grad_norm": 4.1875, "grad_norm_var": 0.1381988525390625, "learning_rate": 0.0001, "loss": 9.8183, "loss/crossentropy": 2.2763094305992126, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3811110109090805, "step": 440 }, { "epoch": 0.027625, "grad_norm": 4.59375, "grad_norm_var": 0.1252593994140625, "learning_rate": 0.0001, "loss": 9.833, "loss/crossentropy": 2.5895315408706665, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.36611178517341614, "step": 442 }, { "epoch": 0.02775, "grad_norm": 4.21875, "grad_norm_var": 0.10857747395833334, "learning_rate": 0.0001, "loss": 9.7918, "loss/crossentropy": 2.1158281564712524, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3374405652284622, "step": 444 }, { "epoch": 0.027875, "grad_norm": 5.96875, "grad_norm_var": 0.20597330729166666, "learning_rate": 0.0001, "loss": 9.9942, "loss/crossentropy": 2.446805953979492, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.37695541977882385, "step": 446 }, { "epoch": 0.028, "grad_norm": 5.3125, "grad_norm_var": 0.24869791666666666, "learning_rate": 0.0001, "loss": 10.0719, "loss/crossentropy": 2.5359551906585693, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3757418543100357, "step": 448 }, { "epoch": 0.028125, "grad_norm": 4.125, "grad_norm_var": 0.24947916666666667, "learning_rate": 0.0001, "loss": 9.8412, "loss/crossentropy": 2.656207323074341, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.42010098695755005, "step": 450 }, { "epoch": 0.02825, "grad_norm": 5.125, "grad_norm_var": 0.26568603515625, "learning_rate": 0.0001, "loss": 9.8241, "loss/crossentropy": 2.4152743816375732, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3626774847507477, "step": 452 }, { "epoch": 0.028375, "grad_norm": 4.3125, "grad_norm_var": 0.26858317057291664, "learning_rate": 0.0001, "loss": 10.136, "loss/crossentropy": 2.4247626066207886, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.36247318983078003, "step": 454 }, { "epoch": 0.0285, "grad_norm": 4.9375, "grad_norm_var": 0.2647939046223958, "learning_rate": 0.0001, "loss": 9.8054, "loss/crossentropy": 2.523893713951111, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.3691753149032593, "step": 456 }, { "epoch": 0.028625, "grad_norm": 5.875, "grad_norm_var": 0.6781646728515625, "learning_rate": 0.0001, "loss": 10.0115, "loss/crossentropy": 2.2744319438934326, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3499785512685776, "step": 458 }, { "epoch": 0.02875, "grad_norm": 4.96875, "grad_norm_var": 0.7750885009765625, "learning_rate": 0.0001, "loss": 9.8553, "loss/crossentropy": 2.383001685142517, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.34379810094833374, "step": 460 }, { "epoch": 0.028875, "grad_norm": 4.28125, "grad_norm_var": 0.7739084879557292, "learning_rate": 0.0001, "loss": 9.9509, "loss/crossentropy": 2.269408345222473, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.39304040372371674, "step": 462 }, { "epoch": 0.029, "grad_norm": 4.625, "grad_norm_var": 0.833544921875, "learning_rate": 0.0001, "loss": 9.8895, "loss/crossentropy": 2.305214285850525, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.32623225450515747, "step": 464 }, { "epoch": 0.029125, "grad_norm": 4.21875, "grad_norm_var": 0.8327799479166667, "learning_rate": 0.0001, "loss": 9.9362, "loss/crossentropy": 2.5358622074127197, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.35859355330467224, "step": 466 }, { "epoch": 0.02925, "grad_norm": 4.3125, "grad_norm_var": 0.8972981770833334, "learning_rate": 0.0001, "loss": 9.9957, "loss/crossentropy": 2.490285634994507, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.35838285088539124, "step": 468 }, { "epoch": 0.029375, "grad_norm": 4.84375, "grad_norm_var": 0.8911417643229167, "learning_rate": 0.0001, "loss": 9.9963, "loss/crossentropy": 2.388529062271118, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3805827349424362, "step": 470 }, { "epoch": 0.0295, "grad_norm": 4.09375, "grad_norm_var": 0.8791575113932292, "learning_rate": 0.0001, "loss": 9.9093, "loss/crossentropy": 2.464060425758362, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.35019560158252716, "step": 472 }, { "epoch": 0.029625, "grad_norm": 5.46875, "grad_norm_var": 0.37202046712239584, "learning_rate": 0.0001, "loss": 9.8783, "loss/crossentropy": 2.3649709224700928, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3797526955604553, "step": 474 }, { "epoch": 0.02975, "grad_norm": 4.25, "grad_norm_var": 0.29755757649739584, "learning_rate": 0.0001, "loss": 9.8323, "loss/crossentropy": 2.4717437028884888, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.35063043236732483, "step": 476 }, { "epoch": 0.029875, "grad_norm": 4.34375, "grad_norm_var": 0.16384175618489583, "learning_rate": 0.0001, "loss": 9.9292, "loss/crossentropy": 2.5398319959640503, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.37523798644542694, "step": 478 }, { "epoch": 0.03, "grad_norm": 4.40625, "grad_norm_var": 0.14685872395833333, "learning_rate": 0.0001, "loss": 9.7299, "loss/crossentropy": 2.080340564250946, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.33192941546440125, "step": 480 }, { "epoch": 0.030125, "grad_norm": 4.46875, "grad_norm_var": 0.14511311848958333, "learning_rate": 0.0001, "loss": 9.6576, "loss/crossentropy": 2.2822307348251343, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.336023285984993, "step": 482 }, { "epoch": 0.03025, "grad_norm": 4.34375, "grad_norm_var": 0.1125, "learning_rate": 0.0001, "loss": 9.5694, "loss/crossentropy": 2.286174952983856, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3336353003978729, "step": 484 }, { "epoch": 0.030375, "grad_norm": 4.6875, "grad_norm_var": 0.13880208333333333, "learning_rate": 0.0001, "loss": 9.7847, "loss/crossentropy": 2.2369834184646606, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.32461969554424286, "step": 486 }, { "epoch": 0.0305, "grad_norm": 3.828125, "grad_norm_var": 0.16288960774739583, "learning_rate": 0.0001, "loss": 9.7289, "loss/crossentropy": 2.3086917400360107, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.321616530418396, "step": 488 }, { "epoch": 0.030625, "grad_norm": 4.21875, "grad_norm_var": 0.09160054524739583, "learning_rate": 0.0001, "loss": 9.8277, "loss/crossentropy": 2.3445401191711426, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3501063734292984, "step": 490 }, { "epoch": 0.03075, "grad_norm": 4.46875, "grad_norm_var": 0.0995513916015625, "learning_rate": 0.0001, "loss": 9.611, "loss/crossentropy": 1.9773722887039185, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.30824264883995056, "step": 492 }, { "epoch": 0.030875, "grad_norm": 4.25, "grad_norm_var": 0.09944559733072916, "learning_rate": 0.0001, "loss": 9.5735, "loss/crossentropy": 2.428261160850525, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.35596518218517303, "step": 494 }, { "epoch": 0.031, "grad_norm": 4.125, "grad_norm_var": 0.09492085774739584, "learning_rate": 0.0001, "loss": 9.7677, "loss/crossentropy": 2.262718915939331, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.33079805970191956, "step": 496 }, { "epoch": 0.031125, "grad_norm": 4.5, "grad_norm_var": 0.10596415201822916, "learning_rate": 0.0001, "loss": 9.7701, "loss/crossentropy": 2.3702725172042847, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3400324583053589, "step": 498 }, { "epoch": 0.03125, "grad_norm": 3.84375, "grad_norm_var": 0.13961588541666667, "learning_rate": 0.0001, "loss": 9.5602, "loss/crossentropy": 2.295218586921692, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3482416272163391, "step": 500 }, { "epoch": 0.031375, "grad_norm": 5.34375, "grad_norm_var": 0.15602213541666668, "learning_rate": 0.0001, "loss": 10.0544, "loss/crossentropy": 2.445479154586792, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.37152746319770813, "step": 502 }, { "epoch": 0.0315, "grad_norm": 4.9375, "grad_norm_var": 0.1950836181640625, "learning_rate": 0.0001, "loss": 9.5511, "loss/crossentropy": 2.223459005355835, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.32452794909477234, "step": 504 }, { "epoch": 0.031625, "grad_norm": 4.5625, "grad_norm_var": 0.19709370930989584, "learning_rate": 0.0001, "loss": 9.8003, "loss/crossentropy": 2.6400363445281982, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3358649015426636, "step": 506 }, { "epoch": 0.03175, "grad_norm": 4.1875, "grad_norm_var": 0.20300191243489582, "learning_rate": 0.0001, "loss": 9.7514, "loss/crossentropy": 2.548031210899353, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3170333355665207, "step": 508 }, { "epoch": 0.031875, "grad_norm": 4.8125, "grad_norm_var": 0.21568094889322917, "learning_rate": 0.0001, "loss": 9.8207, "loss/crossentropy": 2.2956899404525757, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3392469882965088, "step": 510 }, { "epoch": 0.032, "grad_norm": 4.34375, "grad_norm_var": 0.22424723307291666, "learning_rate": 0.0001, "loss": 9.6765, "loss/crossentropy": 2.353795349597931, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.33590464293956757, "step": 512 }, { "epoch": 0.032125, "grad_norm": 3.796875, "grad_norm_var": 0.23311258951822916, "learning_rate": 0.0001, "loss": 9.5705, "loss/crossentropy": 2.3682990074157715, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3214150220155716, "step": 514 }, { "epoch": 0.03225, "grad_norm": 4.375, "grad_norm_var": 0.19153238932291666, "learning_rate": 0.0001, "loss": 9.5203, "loss/crossentropy": 2.2625592947006226, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.35527725517749786, "step": 516 }, { "epoch": 0.032375, "grad_norm": 4.78125, "grad_norm_var": 0.16765950520833334, "learning_rate": 0.0001, "loss": 9.7962, "loss/crossentropy": 2.4448131322860718, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33070215582847595, "step": 518 }, { "epoch": 0.0325, "grad_norm": 3.875, "grad_norm_var": 0.143310546875, "learning_rate": 0.0001, "loss": 9.6837, "loss/crossentropy": 2.3028098344802856, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.34870584309101105, "step": 520 }, { "epoch": 0.032625, "grad_norm": 4.09375, "grad_norm_var": 0.16155497233072916, "learning_rate": 0.0001, "loss": 9.5805, "loss/crossentropy": 2.181080639362335, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.332154244184494, "step": 522 }, { "epoch": 0.03275, "grad_norm": 4.09375, "grad_norm_var": 0.1529937744140625, "learning_rate": 0.0001, "loss": 9.5179, "loss/crossentropy": 2.337011694908142, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.3419201970100403, "step": 524 }, { "epoch": 0.032875, "grad_norm": 4.5, "grad_norm_var": 0.3694976806640625, "learning_rate": 0.0001, "loss": 9.6365, "loss/crossentropy": 2.354939341545105, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.35222889482975006, "step": 526 }, { "epoch": 0.033, "grad_norm": 3.8125, "grad_norm_var": 0.41142476399739586, "learning_rate": 0.0001, "loss": 9.4276, "loss/crossentropy": 2.2241241931915283, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.32786163687705994, "step": 528 }, { "epoch": 0.033125, "grad_norm": 4.40625, "grad_norm_var": 0.3993886311848958, "learning_rate": 0.0001, "loss": 9.5245, "loss/crossentropy": 2.4617063999176025, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.36403751373291016, "step": 530 }, { "epoch": 0.03325, "grad_norm": 4.15625, "grad_norm_var": 0.4066396077473958, "learning_rate": 0.0001, "loss": 9.5201, "loss/crossentropy": 2.2278032302856445, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3158974349498749, "step": 532 }, { "epoch": 0.033375, "grad_norm": 4.6875, "grad_norm_var": 0.3785634358723958, "learning_rate": 0.0001, "loss": 9.6706, "loss/crossentropy": 2.464658737182617, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3708791136741638, "step": 534 }, { "epoch": 0.0335, "grad_norm": 4.25, "grad_norm_var": 0.34780985514322915, "learning_rate": 0.0001, "loss": 9.4853, "loss/crossentropy": 2.659584403038025, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3422502875328064, "step": 536 }, { "epoch": 0.033625, "grad_norm": 4.03125, "grad_norm_var": 0.3409830729166667, "learning_rate": 0.0001, "loss": 9.5004, "loss/crossentropy": 2.3510810136795044, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3363768756389618, "step": 538 }, { "epoch": 0.03375, "grad_norm": 4.0625, "grad_norm_var": 0.3513336181640625, "learning_rate": 0.0001, "loss": 9.6363, "loss/crossentropy": 2.4384061098098755, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3327721059322357, "step": 540 }, { "epoch": 0.033875, "grad_norm": 4.46875, "grad_norm_var": 0.08550516764322917, "learning_rate": 0.0001, "loss": 9.628, "loss/crossentropy": 2.435794949531555, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3326384872198105, "step": 542 }, { "epoch": 0.034, "grad_norm": 4.03125, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 9.6422, "loss/crossentropy": 2.430909752845764, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33669474720954895, "step": 544 }, { "epoch": 0.034125, "grad_norm": 4.3125, "grad_norm_var": 0.06433817545572916, "learning_rate": 0.0001, "loss": 9.4545, "loss/crossentropy": 2.4339792728424072, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.33400970697402954, "step": 546 }, { "epoch": 0.03425, "grad_norm": 3.78125, "grad_norm_var": 0.060334269205729166, "learning_rate": 0.0001, "loss": 9.6066, "loss/crossentropy": 2.601755380630493, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3572891056537628, "step": 548 }, { "epoch": 0.034375, "grad_norm": 4.75, "grad_norm_var": 0.06419169108072917, "learning_rate": 0.0001, "loss": 9.5364, "loss/crossentropy": 2.263180732727051, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33265452086925507, "step": 550 }, { "epoch": 0.0345, "grad_norm": 4.21875, "grad_norm_var": 0.05921122233072917, "learning_rate": 0.0001, "loss": 9.4317, "loss/crossentropy": 2.6668169498443604, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3484792411327362, "step": 552 }, { "epoch": 0.034625, "grad_norm": 4.0, "grad_norm_var": 0.05729878743489583, "learning_rate": 0.0001, "loss": 9.5416, "loss/crossentropy": 2.488753318786621, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3451061546802521, "step": 554 }, { "epoch": 0.03475, "grad_norm": 4.0625, "grad_norm_var": 0.06444905598958334, "learning_rate": 0.0001, "loss": 9.3819, "loss/crossentropy": 2.5572561025619507, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3529306650161743, "step": 556 }, { "epoch": 0.034875, "grad_norm": 4.46875, "grad_norm_var": 0.06443684895833333, "learning_rate": 0.0001, "loss": 9.6456, "loss/crossentropy": 2.3274362087249756, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3618105351924896, "step": 558 }, { "epoch": 0.035, "grad_norm": 4.34375, "grad_norm_var": 0.059370930989583334, "learning_rate": 0.0001, "loss": 9.4529, "loss/crossentropy": 2.5755836963653564, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3374823033809662, "step": 560 }, { "epoch": 0.035125, "grad_norm": 3.640625, "grad_norm_var": 0.0757232666015625, "learning_rate": 0.0001, "loss": 9.5006, "loss/crossentropy": 2.291811466217041, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33205731213092804, "step": 562 }, { "epoch": 0.03525, "grad_norm": 4.125, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 9.4801, "loss/crossentropy": 2.378532886505127, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.31333786249160767, "step": 564 }, { "epoch": 0.035375, "grad_norm": 4.46875, "grad_norm_var": 0.07685546875, "learning_rate": 0.0001, "loss": 9.425, "loss/crossentropy": 2.368729591369629, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.29723505675792694, "step": 566 }, { "epoch": 0.0355, "grad_norm": 3.890625, "grad_norm_var": 0.0939605712890625, "learning_rate": 0.0001, "loss": 9.3256, "loss/crossentropy": 2.359733462333679, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3121738135814667, "step": 568 }, { "epoch": 0.035625, "grad_norm": 4.9375, "grad_norm_var": 0.1553131103515625, "learning_rate": 0.0001, "loss": 9.4416, "loss/crossentropy": 2.315782904624939, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3432431221008301, "step": 570 }, { "epoch": 0.03575, "grad_norm": 3.3125, "grad_norm_var": 0.19507548014322917, "learning_rate": 0.0001, "loss": 9.5614, "loss/crossentropy": 2.3565382957458496, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.32801851630210876, "step": 572 }, { "epoch": 0.035875, "grad_norm": 4.25, "grad_norm_var": 0.19041239420572917, "learning_rate": 0.0001, "loss": 9.3502, "loss/crossentropy": 2.354498505592346, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.34335146844387054, "step": 574 }, { "epoch": 0.036, "grad_norm": 4.0625, "grad_norm_var": 0.1861968994140625, "learning_rate": 0.0001, "loss": 9.4591, "loss/crossentropy": 2.2208141088485718, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.32215404510498047, "step": 576 }, { "epoch": 0.036125, "grad_norm": 3.953125, "grad_norm_var": 0.17675374348958334, "learning_rate": 0.0001, "loss": 9.5684, "loss/crossentropy": 2.1560009717941284, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3317830264568329, "step": 578 }, { "epoch": 0.03625, "grad_norm": 3.84375, "grad_norm_var": 0.1708648681640625, "learning_rate": 0.0001, "loss": 9.384, "loss/crossentropy": 2.254258155822754, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.34166762232780457, "step": 580 }, { "epoch": 0.036375, "grad_norm": 3.90625, "grad_norm_var": 0.14008687337239584, "learning_rate": 0.0001, "loss": 9.4391, "loss/crossentropy": 2.193941831588745, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3121718168258667, "step": 582 }, { "epoch": 0.0365, "grad_norm": 4.1875, "grad_norm_var": 0.13092041015625, "learning_rate": 0.0001, "loss": 9.5948, "loss/crossentropy": 2.610209345817566, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.35641224682331085, "step": 584 }, { "epoch": 0.036625, "grad_norm": 3.9375, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 9.334, "loss/crossentropy": 2.173751473426819, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3093992620706558, "step": 586 }, { "epoch": 0.03675, "grad_norm": 3.734375, "grad_norm_var": 0.021141560872395833, "learning_rate": 0.0001, "loss": 9.3647, "loss/crossentropy": 2.6784013509750366, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3435199409723282, "step": 588 }, { "epoch": 0.036875, "grad_norm": 4.375, "grad_norm_var": 0.028416951497395832, "learning_rate": 0.0001, "loss": 9.6858, "loss/crossentropy": 2.5606144666671753, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3252936899662018, "step": 590 }, { "epoch": 0.037, "grad_norm": 4.28125, "grad_norm_var": 0.07421773274739583, "learning_rate": 0.0001, "loss": 9.1905, "loss/crossentropy": 2.5036474466323853, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3341420292854309, "step": 592 }, { "epoch": 0.037125, "grad_norm": 4.0, "grad_norm_var": 0.07280171712239583, "learning_rate": 0.0001, "loss": 9.2089, "loss/crossentropy": 2.138327479362488, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3032035082578659, "step": 594 }, { "epoch": 0.03725, "grad_norm": 3.296875, "grad_norm_var": 0.11728515625, "learning_rate": 0.0001, "loss": 9.1876, "loss/crossentropy": 2.0817145109176636, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.28961437940597534, "step": 596 }, { "epoch": 0.037375, "grad_norm": 4.40625, "grad_norm_var": 0.127685546875, "learning_rate": 0.0001, "loss": 9.5, "loss/crossentropy": 2.231162667274475, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.32542233169078827, "step": 598 }, { "epoch": 0.0375, "grad_norm": 4.09375, "grad_norm_var": 0.12625325520833333, "learning_rate": 0.0001, "loss": 9.4883, "loss/crossentropy": 2.279044270515442, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.30954092741012573, "step": 600 }, { "epoch": 0.037625, "grad_norm": 3.96875, "grad_norm_var": 0.12911783854166667, "learning_rate": 0.0001, "loss": 9.5667, "loss/crossentropy": 2.124338150024414, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3287663906812668, "step": 602 }, { "epoch": 0.03775, "grad_norm": 4.4375, "grad_norm_var": 0.1248443603515625, "learning_rate": 0.0001, "loss": 9.5844, "loss/crossentropy": 2.5788776874542236, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.34556926786899567, "step": 604 }, { "epoch": 0.037875, "grad_norm": 3.390625, "grad_norm_var": 0.15191650390625, "learning_rate": 0.0001, "loss": 9.4029, "loss/crossentropy": 2.511660575866699, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.31718502938747406, "step": 606 }, { "epoch": 0.038, "grad_norm": 4.6875, "grad_norm_var": 0.13528238932291667, "learning_rate": 0.0001, "loss": 9.4312, "loss/crossentropy": 2.558152675628662, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3450692296028137, "step": 608 }, { "epoch": 0.038125, "grad_norm": 3.921875, "grad_norm_var": 0.13547261555989584, "learning_rate": 0.0001, "loss": 9.2833, "loss/crossentropy": 2.2010965943336487, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.33724747598171234, "step": 610 }, { "epoch": 0.03825, "grad_norm": 3.90625, "grad_norm_var": 0.098974609375, "learning_rate": 0.0001, "loss": 9.4903, "loss/crossentropy": 2.499935030937195, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3286616951227188, "step": 612 }, { "epoch": 0.038375, "grad_norm": 4.0625, "grad_norm_var": 0.08684488932291666, "learning_rate": 0.0001, "loss": 9.3714, "loss/crossentropy": 2.29742568731308, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3321594297885895, "step": 614 }, { "epoch": 0.0385, "grad_norm": 4.4375, "grad_norm_var": 0.10472005208333333, "learning_rate": 0.0001, "loss": 9.2124, "loss/crossentropy": 2.2705591917037964, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.33022937178611755, "step": 616 }, { "epoch": 0.038625, "grad_norm": 3.78125, "grad_norm_var": 0.105615234375, "learning_rate": 0.0001, "loss": 9.3996, "loss/crossentropy": 2.5177834033966064, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.33882059156894684, "step": 618 }, { "epoch": 0.03875, "grad_norm": 3.9375, "grad_norm_var": 0.09058329264322916, "learning_rate": 0.0001, "loss": 9.3251, "loss/crossentropy": 2.3914138078689575, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.300808310508728, "step": 620 }, { "epoch": 0.038875, "grad_norm": 4.84375, "grad_norm_var": 0.107177734375, "learning_rate": 0.0001, "loss": 9.4366, "loss/crossentropy": 2.4114054441452026, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3568413257598877, "step": 622 }, { "epoch": 0.039, "grad_norm": 4.03125, "grad_norm_var": 0.08601786295572916, "learning_rate": 0.0001, "loss": 9.1418, "loss/crossentropy": 2.3419090509414673, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31717973947525024, "step": 624 }, { "epoch": 0.039125, "grad_norm": 4.25, "grad_norm_var": 0.09462890625, "learning_rate": 0.0001, "loss": 9.4373, "loss/crossentropy": 2.64203143119812, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.33858008682727814, "step": 626 }, { "epoch": 0.03925, "grad_norm": 3.625, "grad_norm_var": 0.10220947265625, "learning_rate": 0.0001, "loss": 9.3278, "loss/crossentropy": 2.0542389154434204, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.30543386936187744, "step": 628 }, { "epoch": 0.039375, "grad_norm": 3.875, "grad_norm_var": 0.13300679524739584, "learning_rate": 0.0001, "loss": 9.2397, "loss/crossentropy": 2.4575854539871216, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3102063983678818, "step": 630 }, { "epoch": 0.0395, "grad_norm": 4.28125, "grad_norm_var": 0.1220611572265625, "learning_rate": 0.0001, "loss": 9.3452, "loss/crossentropy": 2.3602949380874634, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.323737695813179, "step": 632 }, { "epoch": 0.039625, "grad_norm": 3.71875, "grad_norm_var": 0.12845052083333333, "learning_rate": 0.0001, "loss": 9.2681, "loss/crossentropy": 2.507497191429138, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3275406062602997, "step": 634 }, { "epoch": 0.03975, "grad_norm": 4.25, "grad_norm_var": 0.13587137858072917, "learning_rate": 0.0001, "loss": 9.36, "loss/crossentropy": 2.2765142917633057, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.31396952271461487, "step": 636 }, { "epoch": 0.039875, "grad_norm": 3.46875, "grad_norm_var": 0.10793863932291667, "learning_rate": 0.0001, "loss": 9.2294, "loss/crossentropy": 2.341191053390503, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.32686179876327515, "step": 638 }, { "epoch": 0.04, "grad_norm": 4.8125, "grad_norm_var": 0.1619781494140625, "learning_rate": 0.0001, "loss": 9.2556, "loss/crossentropy": 2.252098858356476, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3205975890159607, "step": 640 }, { "epoch": 0.040125, "grad_norm": 3.4375, "grad_norm_var": 0.20991109212239584, "learning_rate": 0.0001, "loss": 9.3137, "loss/crossentropy": 2.2994823455810547, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.34002968668937683, "step": 642 }, { "epoch": 0.04025, "grad_norm": 4.375, "grad_norm_var": 0.21840718587239583, "learning_rate": 0.0001, "loss": 9.1512, "loss/crossentropy": 2.5480741262435913, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.33601297438144684, "step": 644 }, { "epoch": 0.040375, "grad_norm": 3.578125, "grad_norm_var": 0.23454488118489583, "learning_rate": 0.0001, "loss": 9.3808, "loss/crossentropy": 2.524424910545349, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3395262509584427, "step": 646 }, { "epoch": 0.0405, "grad_norm": 3.75, "grad_norm_var": 0.23319905598958332, "learning_rate": 0.0001, "loss": 9.1816, "loss/crossentropy": 2.2198326587677, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33485807478427887, "step": 648 }, { "epoch": 0.040625, "grad_norm": 4.125, "grad_norm_var": 0.23205973307291666, "learning_rate": 0.0001, "loss": 9.252, "loss/crossentropy": 2.5186063051223755, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.33504799008369446, "step": 650 }, { "epoch": 0.04075, "grad_norm": 3.5, "grad_norm_var": 0.23567606608072916, "learning_rate": 0.0001, "loss": 9.299, "loss/crossentropy": 2.3492661714553833, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.31913943588733673, "step": 652 }, { "epoch": 0.040875, "grad_norm": 4.1875, "grad_norm_var": 0.23007405598958333, "learning_rate": 0.0001, "loss": 9.2997, "loss/crossentropy": 2.600319981575012, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.32478684186935425, "step": 654 }, { "epoch": 0.041, "grad_norm": 3.859375, "grad_norm_var": 0.17136942545572917, "learning_rate": 0.0001, "loss": 9.1048, "loss/crossentropy": 2.335653781890869, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3278462737798691, "step": 656 }, { "epoch": 0.041125, "grad_norm": 3.640625, "grad_norm_var": 0.12332356770833333, "learning_rate": 0.0001, "loss": 9.2115, "loss/crossentropy": 2.223168969154358, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3341253995895386, "step": 658 }, { "epoch": 0.04125, "grad_norm": 5.3125, "grad_norm_var": 0.252490234375, "learning_rate": 0.0001, "loss": 9.1622, "loss/crossentropy": 2.424691677093506, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3088841736316681, "step": 660 }, { "epoch": 0.041375, "grad_norm": 4.625, "grad_norm_var": 0.2993072509765625, "learning_rate": 0.0001, "loss": 9.1041, "loss/crossentropy": 2.07004451751709, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2844501733779907, "step": 662 }, { "epoch": 0.0415, "grad_norm": 5.03125, "grad_norm_var": 0.3312459309895833, "learning_rate": 0.0001, "loss": 9.3562, "loss/crossentropy": 2.32711398601532, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3303475081920624, "step": 664 }, { "epoch": 0.041625, "grad_norm": 4.3125, "grad_norm_var": 0.48313700358072914, "learning_rate": 0.0001, "loss": 9.2399, "loss/crossentropy": 2.3355051279067993, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.34253838658332825, "step": 666 }, { "epoch": 0.04175, "grad_norm": 4.0, "grad_norm_var": 0.504443359375, "learning_rate": 0.0001, "loss": 9.3712, "loss/crossentropy": 2.423375368118286, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31640373170375824, "step": 668 }, { "epoch": 0.041875, "grad_norm": 3.8125, "grad_norm_var": 0.49081624348958336, "learning_rate": 0.0001, "loss": 9.1568, "loss/crossentropy": 2.4355897903442383, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3098563849925995, "step": 670 }, { "epoch": 0.042, "grad_norm": 3.65625, "grad_norm_var": 0.5179433186848958, "learning_rate": 0.0001, "loss": 9.0687, "loss/crossentropy": 2.151113748550415, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.28571945428848267, "step": 672 }, { "epoch": 0.042125, "grad_norm": 4.09375, "grad_norm_var": 0.45859375, "learning_rate": 0.0001, "loss": 9.126, "loss/crossentropy": 2.1033096313476562, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30395573377609253, "step": 674 }, { "epoch": 0.04225, "grad_norm": 3.703125, "grad_norm_var": 0.41311848958333336, "learning_rate": 0.0001, "loss": 9.2339, "loss/crossentropy": 2.384363532066345, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3477473706007004, "step": 676 }, { "epoch": 0.042375, "grad_norm": 4.0, "grad_norm_var": 0.35420633951822916, "learning_rate": 0.0001, "loss": 9.2017, "loss/crossentropy": 2.3887627124786377, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3319309651851654, "step": 678 }, { "epoch": 0.0425, "grad_norm": 3.796875, "grad_norm_var": 0.28951416015625, "learning_rate": 0.0001, "loss": 9.0506, "loss/crossentropy": 2.3131089210510254, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31970134377479553, "step": 680 }, { "epoch": 0.042625, "grad_norm": 3.515625, "grad_norm_var": 0.0540191650390625, "learning_rate": 0.0001, "loss": 9.3033, "loss/crossentropy": 2.2213594913482666, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.2932916283607483, "step": 682 }, { "epoch": 0.04275, "grad_norm": 3.9375, "grad_norm_var": 0.04342041015625, "learning_rate": 0.0001, "loss": 9.1272, "loss/crossentropy": 2.343689441680908, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31054770946502686, "step": 684 }, { "epoch": 0.042875, "grad_norm": 4.125, "grad_norm_var": 0.07541910807291667, "learning_rate": 0.0001, "loss": 9.0333, "loss/crossentropy": 2.426623225212097, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.30797363817691803, "step": 686 }, { "epoch": 0.043, "grad_norm": 3.75, "grad_norm_var": 0.07333577473958333, "learning_rate": 0.0001, "loss": 9.1961, "loss/crossentropy": 2.1243041157722473, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.27504249662160873, "step": 688 }, { "epoch": 0.043125, "grad_norm": 3.546875, "grad_norm_var": 0.07789306640625, "learning_rate": 0.0001, "loss": 9.3228, "loss/crossentropy": 2.2858647108078003, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3059113025665283, "step": 690 }, { "epoch": 0.04325, "grad_norm": 4.03125, "grad_norm_var": 0.07827046712239584, "learning_rate": 0.0001, "loss": 9.2223, "loss/crossentropy": 2.6258697509765625, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32897868752479553, "step": 692 }, { "epoch": 0.043375, "grad_norm": 4.15625, "grad_norm_var": 0.0829986572265625, "learning_rate": 0.0001, "loss": 9.0107, "loss/crossentropy": 2.3375871181488037, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.30802060663700104, "step": 694 }, { "epoch": 0.0435, "grad_norm": 3.96875, "grad_norm_var": 0.08065999348958333, "learning_rate": 0.0001, "loss": 9.1397, "loss/crossentropy": 2.27328884601593, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32508768141269684, "step": 696 }, { "epoch": 0.043625, "grad_norm": 3.90625, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 9.0871, "loss/crossentropy": 2.3383896350860596, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3195807486772537, "step": 698 }, { "epoch": 0.04375, "grad_norm": 3.609375, "grad_norm_var": 0.07858784993489583, "learning_rate": 0.0001, "loss": 9.0647, "loss/crossentropy": 1.985029935836792, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2951173782348633, "step": 700 }, { "epoch": 0.043875, "grad_norm": 3.5625, "grad_norm_var": 0.04241129557291667, "learning_rate": 0.0001, "loss": 9.2963, "loss/crossentropy": 2.236124038696289, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3033126890659332, "step": 702 }, { "epoch": 0.044, "grad_norm": 3.9375, "grad_norm_var": 0.03899332682291667, "learning_rate": 0.0001, "loss": 9.4002, "loss/crossentropy": 2.3327068090438843, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.35688331723213196, "step": 704 }, { "epoch": 0.044125, "grad_norm": 3.40625, "grad_norm_var": 0.04415690104166667, "learning_rate": 0.0001, "loss": 9.1288, "loss/crossentropy": 2.185292422771454, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2997971922159195, "step": 706 }, { "epoch": 0.04425, "grad_norm": 4.03125, "grad_norm_var": 0.048193359375, "learning_rate": 0.0001, "loss": 9.1816, "loss/crossentropy": 2.592350959777832, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3380052447319031, "step": 708 }, { "epoch": 0.044375, "grad_norm": 3.625, "grad_norm_var": 0.050837198893229164, "learning_rate": 0.0001, "loss": 9.2751, "loss/crossentropy": 2.3508870601654053, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.30894704163074493, "step": 710 }, { "epoch": 0.0445, "grad_norm": 3.703125, "grad_norm_var": 0.05436909993489583, "learning_rate": 0.0001, "loss": 9.2454, "loss/crossentropy": 2.4968451261520386, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3285796344280243, "step": 712 }, { "epoch": 0.044625, "grad_norm": 3.90625, "grad_norm_var": 0.10283203125, "learning_rate": 0.0001, "loss": 9.3351, "loss/crossentropy": 2.4106976985931396, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.32071977853775024, "step": 714 }, { "epoch": 0.04475, "grad_norm": 3.609375, "grad_norm_var": 0.09908447265625, "learning_rate": 0.0001, "loss": 9.0656, "loss/crossentropy": 2.2552783489227295, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.30507735908031464, "step": 716 }, { "epoch": 0.044875, "grad_norm": 3.96875, "grad_norm_var": 0.09814046223958334, "learning_rate": 0.0001, "loss": 9.0835, "loss/crossentropy": 2.4068862199783325, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31281837821006775, "step": 718 }, { "epoch": 0.045, "grad_norm": 3.75, "grad_norm_var": 0.17351888020833334, "learning_rate": 0.0001, "loss": 9.2535, "loss/crossentropy": 2.2698957920074463, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3157753646373749, "step": 720 }, { "epoch": 0.045125, "grad_norm": 4.25, "grad_norm_var": 0.25741780598958336, "learning_rate": 0.0001, "loss": 9.2178, "loss/crossentropy": 2.5466257333755493, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3196914494037628, "step": 722 }, { "epoch": 0.04525, "grad_norm": 3.75, "grad_norm_var": 0.2650553385416667, "learning_rate": 0.0001, "loss": 9.0596, "loss/crossentropy": 2.259085774421692, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30355823040008545, "step": 724 }, { "epoch": 0.045375, "grad_norm": 3.828125, "grad_norm_var": 0.2575836181640625, "learning_rate": 0.0001, "loss": 9.3315, "loss/crossentropy": 2.420317769050598, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3066561073064804, "step": 726 }, { "epoch": 0.0455, "grad_norm": 3.65625, "grad_norm_var": 0.24367574055989583, "learning_rate": 0.0001, "loss": 9.074, "loss/crossentropy": 2.244703531265259, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.30555886030197144, "step": 728 }, { "epoch": 0.045625, "grad_norm": 3.875, "grad_norm_var": 0.21819254557291667, "learning_rate": 0.0001, "loss": 9.1535, "loss/crossentropy": 2.3010120391845703, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30051296949386597, "step": 730 }, { "epoch": 0.04575, "grad_norm": 3.8125, "grad_norm_var": 0.2000885009765625, "learning_rate": 0.0001, "loss": 9.2806, "loss/crossentropy": 2.0744789838790894, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3020750731229782, "step": 732 }, { "epoch": 0.045875, "grad_norm": 4.875, "grad_norm_var": 0.2525390625, "learning_rate": 0.0001, "loss": 9.1755, "loss/crossentropy": 2.4247848987579346, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30639201402664185, "step": 734 }, { "epoch": 0.046, "grad_norm": 3.6875, "grad_norm_var": 0.21112874348958333, "learning_rate": 0.0001, "loss": 9.2127, "loss/crossentropy": 2.228127598762512, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2969563454389572, "step": 736 }, { "epoch": 0.046125, "grad_norm": 3.453125, "grad_norm_var": 0.12919514973958332, "learning_rate": 0.0001, "loss": 8.998, "loss/crossentropy": 2.3098256587982178, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.30860866606235504, "step": 738 }, { "epoch": 0.04625, "grad_norm": 3.46875, "grad_norm_var": 0.14089253743489583, "learning_rate": 0.0001, "loss": 8.9791, "loss/crossentropy": 2.135189712047577, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3053634464740753, "step": 740 }, { "epoch": 0.046375, "grad_norm": 3.71875, "grad_norm_var": 0.14488016764322917, "learning_rate": 0.0001, "loss": 9.0268, "loss/crossentropy": 2.4625048637390137, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3120746314525604, "step": 742 }, { "epoch": 0.0465, "grad_norm": 3.734375, "grad_norm_var": 0.142333984375, "learning_rate": 0.0001, "loss": 9.1708, "loss/crossentropy": 2.1217297315597534, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2820632755756378, "step": 744 }, { "epoch": 0.046625, "grad_norm": 3.453125, "grad_norm_var": 0.16352437337239584, "learning_rate": 0.0001, "loss": 8.8857, "loss/crossentropy": 2.454026937484741, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.28094005584716797, "step": 746 }, { "epoch": 0.04675, "grad_norm": 4.5, "grad_norm_var": 0.19820556640625, "learning_rate": 0.0001, "loss": 9.1644, "loss/crossentropy": 2.413946032524109, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3286159932613373, "step": 748 }, { "epoch": 0.046875, "grad_norm": 3.265625, "grad_norm_var": 0.12294514973958333, "learning_rate": 0.0001, "loss": 8.881, "loss/crossentropy": 2.2393068075180054, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30563026666641235, "step": 750 }, { "epoch": 0.047, "grad_norm": 3.25, "grad_norm_var": 0.11814676920572917, "learning_rate": 0.0001, "loss": 8.8969, "loss/crossentropy": 2.2508221864700317, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2862202823162079, "step": 752 }, { "epoch": 0.047125, "grad_norm": 4.03125, "grad_norm_var": 0.13244527180989582, "learning_rate": 0.0001, "loss": 9.265, "loss/crossentropy": 2.1571661233901978, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3307356685400009, "step": 754 }, { "epoch": 0.04725, "grad_norm": 3.34375, "grad_norm_var": 0.13481343587239583, "learning_rate": 0.0001, "loss": 8.9922, "loss/crossentropy": 2.393697500228882, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3230316936969757, "step": 756 }, { "epoch": 0.047375, "grad_norm": 3.640625, "grad_norm_var": 0.13456624348958332, "learning_rate": 0.0001, "loss": 8.9217, "loss/crossentropy": 2.249446392059326, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2918042242527008, "step": 758 }, { "epoch": 0.0475, "grad_norm": 3.75, "grad_norm_var": 0.13603413899739583, "learning_rate": 0.0001, "loss": 9.0298, "loss/crossentropy": 2.4479427337646484, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3056575655937195, "step": 760 }, { "epoch": 0.047625, "grad_norm": 3.75, "grad_norm_var": 0.12724609375, "learning_rate": 0.0001, "loss": 9.2332, "loss/crossentropy": 2.1675299406051636, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.303151935338974, "step": 762 }, { "epoch": 0.04775, "grad_norm": 3.96875, "grad_norm_var": 0.08153889973958334, "learning_rate": 0.0001, "loss": 9.0754, "loss/crossentropy": 2.5868079662323, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.2945869415998459, "step": 764 }, { "epoch": 0.047875, "grad_norm": 3.609375, "grad_norm_var": 0.07388916015625, "learning_rate": 0.0001, "loss": 9.141, "loss/crossentropy": 2.535553216934204, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.32601119577884674, "step": 766 }, { "epoch": 0.048, "grad_norm": 4.0625, "grad_norm_var": 0.05016276041666667, "learning_rate": 0.0001, "loss": 9.2516, "loss/crossentropy": 2.5762476921081543, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.34192636609077454, "step": 768 }, { "epoch": 0.048125, "grad_norm": 3.921875, "grad_norm_var": 0.0440582275390625, "learning_rate": 0.0001, "loss": 9.0968, "loss/crossentropy": 2.402553081512451, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3043902814388275, "step": 770 }, { "epoch": 0.04825, "grad_norm": 3.3125, "grad_norm_var": 0.042801920572916666, "learning_rate": 0.0001, "loss": 9.0553, "loss/crossentropy": 2.4971920251846313, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30807921290397644, "step": 772 }, { "epoch": 0.048375, "grad_norm": 3.625, "grad_norm_var": 0.043843587239583336, "learning_rate": 0.0001, "loss": 9.1797, "loss/crossentropy": 2.2825552225112915, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31863027811050415, "step": 774 }, { "epoch": 0.0485, "grad_norm": 3.625, "grad_norm_var": 0.04838765462239583, "learning_rate": 0.0001, "loss": 8.9698, "loss/crossentropy": 2.516672134399414, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.31801968812942505, "step": 776 }, { "epoch": 0.048625, "grad_norm": 4.125, "grad_norm_var": 0.06825764973958333, "learning_rate": 0.0001, "loss": 8.9749, "loss/crossentropy": 2.495086908340454, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.31983429193496704, "step": 778 }, { "epoch": 0.04875, "grad_norm": 4.28125, "grad_norm_var": 0.12460530598958333, "learning_rate": 0.0001, "loss": 9.1932, "loss/crossentropy": 2.3892232179641724, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3304327577352524, "step": 780 }, { "epoch": 0.048875, "grad_norm": 3.3125, "grad_norm_var": 0.17346598307291666, "learning_rate": 0.0001, "loss": 9.0205, "loss/crossentropy": 2.4275970458984375, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.29281261563301086, "step": 782 }, { "epoch": 0.049, "grad_norm": 3.90625, "grad_norm_var": 0.16988525390625, "learning_rate": 0.0001, "loss": 9.0345, "loss/crossentropy": 2.147684335708618, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30130012333393097, "step": 784 }, { "epoch": 0.049125, "grad_norm": 3.53125, "grad_norm_var": 0.1740234375, "learning_rate": 0.0001, "loss": 9.1809, "loss/crossentropy": 2.387849450111389, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.32018375396728516, "step": 786 }, { "epoch": 0.04925, "grad_norm": 3.59375, "grad_norm_var": 0.16553446451822917, "learning_rate": 0.0001, "loss": 8.9704, "loss/crossentropy": 2.1901475191116333, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.2830745130777359, "step": 788 }, { "epoch": 0.049375, "grad_norm": 3.390625, "grad_norm_var": 0.17476806640625, "learning_rate": 0.0001, "loss": 9.0056, "loss/crossentropy": 2.476477026939392, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30152270197868347, "step": 790 }, { "epoch": 0.0495, "grad_norm": 3.75, "grad_norm_var": 0.16788736979166666, "learning_rate": 0.0001, "loss": 9.2374, "loss/crossentropy": 2.4895143508911133, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.32273176312446594, "step": 792 }, { "epoch": 0.049625, "grad_norm": 3.390625, "grad_norm_var": 0.16460673014322916, "learning_rate": 0.0001, "loss": 8.8151, "loss/crossentropy": 2.1444047689437866, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2969019412994385, "step": 794 }, { "epoch": 0.04975, "grad_norm": 3.390625, "grad_norm_var": 0.06581624348958333, "learning_rate": 0.0001, "loss": 9.0359, "loss/crossentropy": 2.314146399497986, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.297823429107666, "step": 796 }, { "epoch": 0.049875, "grad_norm": 3.484375, "grad_norm_var": 0.046223958333333336, "learning_rate": 0.0001, "loss": 8.9617, "loss/crossentropy": 2.1535879373550415, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.29098525643348694, "step": 798 }, { "epoch": 0.05, "grad_norm": 3.59375, "grad_norm_var": 0.04029032389322917, "learning_rate": 0.0001, "loss": 8.9443, "loss/crossentropy": 2.5792254209518433, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30909422039985657, "step": 800 }, { "epoch": 0.050125, "grad_norm": 3.671875, "grad_norm_var": 0.041258748372395834, "learning_rate": 0.0001, "loss": 8.9077, "loss/crossentropy": 2.387328624725342, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.35100018978118896, "step": 802 }, { "epoch": 0.05025, "grad_norm": 3.484375, "grad_norm_var": 0.0410308837890625, "learning_rate": 0.0001, "loss": 8.9889, "loss/crossentropy": 2.5987643003463745, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2944849133491516, "step": 804 }, { "epoch": 0.050375, "grad_norm": 3.46875, "grad_norm_var": 0.03965555826822917, "learning_rate": 0.0001, "loss": 8.688, "loss/crossentropy": 2.291478753089905, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3056093603372574, "step": 806 }, { "epoch": 0.0505, "grad_norm": 3.6875, "grad_norm_var": 0.030744425455729165, "learning_rate": 0.0001, "loss": 8.8988, "loss/crossentropy": 2.281537890434265, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31349045038223267, "step": 808 }, { "epoch": 0.050625, "grad_norm": 3.546875, "grad_norm_var": 0.026634724934895833, "learning_rate": 0.0001, "loss": 9.0642, "loss/crossentropy": 2.16494482755661, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.307197168469429, "step": 810 }, { "epoch": 0.05075, "grad_norm": 3.75, "grad_norm_var": 0.019953409830729168, "learning_rate": 0.0001, "loss": 8.9202, "loss/crossentropy": 2.296713352203369, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30054841935634613, "step": 812 }, { "epoch": 0.050875, "grad_norm": 3.640625, "grad_norm_var": 0.021012369791666666, "learning_rate": 0.0001, "loss": 8.8666, "loss/crossentropy": 2.0488401055336, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2681543007493019, "step": 814 }, { "epoch": 0.051, "grad_norm": 3.75, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 8.9113, "loss/crossentropy": 2.1317135095596313, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2823093831539154, "step": 816 }, { "epoch": 0.051125, "grad_norm": 3.640625, "grad_norm_var": 0.03144124348958333, "learning_rate": 0.0001, "loss": 8.9404, "loss/crossentropy": 2.3469539880752563, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30824559926986694, "step": 818 }, { "epoch": 0.05125, "grad_norm": 3.921875, "grad_norm_var": 0.05701395670572917, "learning_rate": 0.0001, "loss": 9.108, "loss/crossentropy": 2.4571563005447388, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31656327843666077, "step": 820 }, { "epoch": 0.051375, "grad_norm": 3.65625, "grad_norm_var": 0.053141276041666664, "learning_rate": 0.0001, "loss": 9.0717, "loss/crossentropy": 2.3994951248168945, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3029633164405823, "step": 822 }, { "epoch": 0.0515, "grad_norm": 3.6875, "grad_norm_var": 0.051985677083333334, "learning_rate": 0.0001, "loss": 8.9063, "loss/crossentropy": 2.2581586837768555, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30481448769569397, "step": 824 }, { "epoch": 0.051625, "grad_norm": 3.125, "grad_norm_var": 0.07155659993489584, "learning_rate": 0.0001, "loss": 8.8898, "loss/crossentropy": 2.071722984313965, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2941209524869919, "step": 826 }, { "epoch": 0.05175, "grad_norm": 3.625, "grad_norm_var": 0.0892578125, "learning_rate": 0.0001, "loss": 9.0358, "loss/crossentropy": 2.5443246364593506, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29996325075626373, "step": 828 }, { "epoch": 0.051875, "grad_norm": 3.40625, "grad_norm_var": 0.09868062337239583, "learning_rate": 0.0001, "loss": 8.9568, "loss/crossentropy": 2.384023070335388, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31264999508857727, "step": 830 }, { "epoch": 0.052, "grad_norm": 3.953125, "grad_norm_var": 0.0974609375, "learning_rate": 0.0001, "loss": 8.9013, "loss/crossentropy": 2.4286372661590576, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.32337459921836853, "step": 832 }, { "epoch": 0.052125, "grad_norm": 3.71875, "grad_norm_var": 0.08315327962239584, "learning_rate": 0.0001, "loss": 8.7277, "loss/crossentropy": 2.1722983717918396, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3173587769269943, "step": 834 }, { "epoch": 0.05225, "grad_norm": 4.03125, "grad_norm_var": 0.07082417805989584, "learning_rate": 0.0001, "loss": 9.0322, "loss/crossentropy": 2.6337625980377197, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.30448000133037567, "step": 836 }, { "epoch": 0.052375, "grad_norm": 3.53125, "grad_norm_var": 0.0727203369140625, "learning_rate": 0.0001, "loss": 8.9959, "loss/crossentropy": 2.303470253944397, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.318126916885376, "step": 838 }, { "epoch": 0.0525, "grad_norm": 3.734375, "grad_norm_var": 0.0729888916015625, "learning_rate": 0.0001, "loss": 8.7451, "loss/crossentropy": 2.295042037963867, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.30773746967315674, "step": 840 }, { "epoch": 0.052625, "grad_norm": 3.0625, "grad_norm_var": 0.0851226806640625, "learning_rate": 0.0001, "loss": 8.6922, "loss/crossentropy": 2.060616612434387, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.29626762866973877, "step": 842 }, { "epoch": 0.05275, "grad_norm": 3.671875, "grad_norm_var": 0.070556640625, "learning_rate": 0.0001, "loss": 8.9668, "loss/crossentropy": 2.2909332513809204, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.2921592891216278, "step": 844 }, { "epoch": 0.052875, "grad_norm": 4.21875, "grad_norm_var": 0.092822265625, "learning_rate": 0.0001, "loss": 9.1657, "loss/crossentropy": 2.5100677013397217, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3154117166996002, "step": 846 }, { "epoch": 0.053, "grad_norm": 3.546875, "grad_norm_var": 0.09158528645833333, "learning_rate": 0.0001, "loss": 9.1623, "loss/crossentropy": 2.3510499000549316, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.28174377977848053, "step": 848 }, { "epoch": 0.053125, "grad_norm": 3.5625, "grad_norm_var": 0.08963216145833333, "learning_rate": 0.0001, "loss": 8.8411, "loss/crossentropy": 2.361242413520813, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2962343841791153, "step": 850 }, { "epoch": 0.05325, "grad_norm": 3.40625, "grad_norm_var": 0.08534749348958333, "learning_rate": 0.0001, "loss": 8.8426, "loss/crossentropy": 1.9918023943901062, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.26788248866796494, "step": 852 }, { "epoch": 0.053375, "grad_norm": 3.25, "grad_norm_var": 0.0908111572265625, "learning_rate": 0.0001, "loss": 8.9666, "loss/crossentropy": 2.6088958978652954, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3181132972240448, "step": 854 }, { "epoch": 0.0535, "grad_norm": 3.8125, "grad_norm_var": 0.09516499837239584, "learning_rate": 0.0001, "loss": 8.8605, "loss/crossentropy": 2.1732386350631714, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3017688989639282, "step": 856 }, { "epoch": 0.053625, "grad_norm": 3.578125, "grad_norm_var": 0.080029296875, "learning_rate": 0.0001, "loss": 9.0765, "loss/crossentropy": 2.3053905963897705, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28416720032691956, "step": 858 }, { "epoch": 0.05375, "grad_norm": 3.53125, "grad_norm_var": 0.08489481608072917, "learning_rate": 0.0001, "loss": 8.9506, "loss/crossentropy": 2.3922606706619263, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3041910231113434, "step": 860 }, { "epoch": 0.053875, "grad_norm": 3.625, "grad_norm_var": 0.059326171875, "learning_rate": 0.0001, "loss": 8.9744, "loss/crossentropy": 2.2312803864479065, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.28423628211021423, "step": 862 }, { "epoch": 0.054, "grad_norm": 3.5625, "grad_norm_var": 0.053938802083333334, "learning_rate": 0.0001, "loss": 8.8504, "loss/crossentropy": 2.4400585889816284, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32312142848968506, "step": 864 }, { "epoch": 0.054125, "grad_norm": 3.453125, "grad_norm_var": 0.06122639973958333, "learning_rate": 0.0001, "loss": 8.7439, "loss/crossentropy": 2.16322124004364, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3096587359905243, "step": 866 }, { "epoch": 0.05425, "grad_norm": 3.59375, "grad_norm_var": 0.05967508951822917, "learning_rate": 0.0001, "loss": 8.7824, "loss/crossentropy": 2.2714940309524536, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31225262582302094, "step": 868 }, { "epoch": 0.054375, "grad_norm": 3.59375, "grad_norm_var": 0.061335245768229164, "learning_rate": 0.0001, "loss": 8.9188, "loss/crossentropy": 2.3233022689819336, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.29183535277843475, "step": 870 }, { "epoch": 0.0545, "grad_norm": 3.46875, "grad_norm_var": 0.060628255208333336, "learning_rate": 0.0001, "loss": 8.8018, "loss/crossentropy": 2.1473275423049927, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.302778959274292, "step": 872 }, { "epoch": 0.054625, "grad_norm": 3.5, "grad_norm_var": 0.04723307291666667, "learning_rate": 0.0001, "loss": 8.7291, "loss/crossentropy": 2.10613477230072, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2919985055923462, "step": 874 }, { "epoch": 0.05475, "grad_norm": 3.828125, "grad_norm_var": 0.04791259765625, "learning_rate": 0.0001, "loss": 8.8805, "loss/crossentropy": 2.391135811805725, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.303908035159111, "step": 876 }, { "epoch": 0.054875, "grad_norm": 4.21875, "grad_norm_var": 0.07213134765625, "learning_rate": 0.0001, "loss": 8.9757, "loss/crossentropy": 2.191763758659363, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29602205753326416, "step": 878 }, { "epoch": 0.055, "grad_norm": 3.09375, "grad_norm_var": 0.0880859375, "learning_rate": 0.0001, "loss": 8.9226, "loss/crossentropy": 2.3897584676742554, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2984778434038162, "step": 880 }, { "epoch": 0.055125, "grad_norm": 3.53125, "grad_norm_var": 0.10453999837239583, "learning_rate": 0.0001, "loss": 8.6909, "loss/crossentropy": 2.2274895906448364, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3162970691919327, "step": 882 }, { "epoch": 0.05525, "grad_norm": 3.5, "grad_norm_var": 0.10286051432291667, "learning_rate": 0.0001, "loss": 8.7805, "loss/crossentropy": 2.238261342048645, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3091089427471161, "step": 884 }, { "epoch": 0.055375, "grad_norm": 3.578125, "grad_norm_var": 0.09251200358072917, "learning_rate": 0.0001, "loss": 8.8335, "loss/crossentropy": 2.398587703704834, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30020007491111755, "step": 886 }, { "epoch": 0.0555, "grad_norm": 3.125, "grad_norm_var": 0.11856180826822917, "learning_rate": 0.0001, "loss": 8.6695, "loss/crossentropy": 2.362698554992676, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3049694448709488, "step": 888 }, { "epoch": 0.055625, "grad_norm": 4.09375, "grad_norm_var": 0.13857014973958334, "learning_rate": 0.0001, "loss": 8.8959, "loss/crossentropy": 2.483735680580139, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3071689158678055, "step": 890 }, { "epoch": 0.05575, "grad_norm": 3.734375, "grad_norm_var": 0.1455230712890625, "learning_rate": 0.0001, "loss": 9.1407, "loss/crossentropy": 2.6666314601898193, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3311910331249237, "step": 892 }, { "epoch": 0.055875, "grad_norm": 3.703125, "grad_norm_var": 0.1199615478515625, "learning_rate": 0.0001, "loss": 8.8907, "loss/crossentropy": 2.3543641567230225, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30659469962120056, "step": 894 }, { "epoch": 0.056, "grad_norm": 3.546875, "grad_norm_var": 0.11165364583333333, "learning_rate": 0.0001, "loss": 8.542, "loss/crossentropy": 2.442333459854126, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2963842451572418, "step": 896 }, { "epoch": 0.056125, "grad_norm": 3.25, "grad_norm_var": 0.09463602701822917, "learning_rate": 0.0001, "loss": 8.8421, "loss/crossentropy": 2.5169384479522705, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31081072986125946, "step": 898 }, { "epoch": 0.05625, "grad_norm": 3.609375, "grad_norm_var": 0.09204813639322916, "learning_rate": 0.0001, "loss": 8.8349, "loss/crossentropy": 2.378996729850769, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2988849878311157, "step": 900 }, { "epoch": 0.056375, "grad_norm": 4.0, "grad_norm_var": 0.10779520670572916, "learning_rate": 0.0001, "loss": 8.6611, "loss/crossentropy": 2.0278642177581787, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.26663029193878174, "step": 902 }, { "epoch": 0.0565, "grad_norm": 3.625, "grad_norm_var": 0.08515218098958334, "learning_rate": 0.0001, "loss": 8.8614, "loss/crossentropy": 2.3273751735687256, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2889983803033829, "step": 904 }, { "epoch": 0.056625, "grad_norm": 3.46875, "grad_norm_var": 0.07197265625, "learning_rate": 0.0001, "loss": 8.8386, "loss/crossentropy": 2.0480875372886658, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30192601680755615, "step": 906 }, { "epoch": 0.05675, "grad_norm": 3.859375, "grad_norm_var": 0.05976155598958333, "learning_rate": 0.0001, "loss": 9.0464, "loss/crossentropy": 2.5360888242721558, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.28871724009513855, "step": 908 }, { "epoch": 0.056875, "grad_norm": 3.25, "grad_norm_var": 0.060846964518229164, "learning_rate": 0.0001, "loss": 8.9657, "loss/crossentropy": 2.59428608417511, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3197096735239029, "step": 910 }, { "epoch": 0.057, "grad_norm": 3.46875, "grad_norm_var": 0.062027994791666666, "learning_rate": 0.0001, "loss": 8.8488, "loss/crossentropy": 2.4978867769241333, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.38725124299526215, "step": 912 }, { "epoch": 0.057125, "grad_norm": 3.640625, "grad_norm_var": 0.060347493489583334, "learning_rate": 0.0001, "loss": 9.0475, "loss/crossentropy": 2.269066333770752, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2860633432865143, "step": 914 }, { "epoch": 0.05725, "grad_norm": 3.03125, "grad_norm_var": 0.07659505208333334, "learning_rate": 0.0001, "loss": 8.6945, "loss/crossentropy": 2.3426260948181152, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2847317010164261, "step": 916 }, { "epoch": 0.057375, "grad_norm": 3.53125, "grad_norm_var": 0.055562337239583336, "learning_rate": 0.0001, "loss": 8.7413, "loss/crossentropy": 2.3890886306762695, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2951260507106781, "step": 918 }, { "epoch": 0.0575, "grad_norm": 3.46875, "grad_norm_var": 0.05225321451822917, "learning_rate": 0.0001, "loss": 8.726, "loss/crossentropy": 2.324913740158081, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2819140702486038, "step": 920 }, { "epoch": 0.057625, "grad_norm": 4.375, "grad_norm_var": 0.10100809733072917, "learning_rate": 0.0001, "loss": 8.8514, "loss/crossentropy": 2.4282515048980713, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3256274312734604, "step": 922 }, { "epoch": 0.05775, "grad_norm": 3.46875, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 8.6487, "loss/crossentropy": 2.476005434989929, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3199878931045532, "step": 924 }, { "epoch": 0.057875, "grad_norm": 3.71875, "grad_norm_var": 0.13284098307291667, "learning_rate": 0.0001, "loss": 8.8988, "loss/crossentropy": 2.469366192817688, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2891136407852173, "step": 926 }, { "epoch": 0.058, "grad_norm": 3.234375, "grad_norm_var": 0.14055989583333334, "learning_rate": 0.0001, "loss": 8.7523, "loss/crossentropy": 2.348281979560852, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.27692942321300507, "step": 928 }, { "epoch": 0.058125, "grad_norm": 3.515625, "grad_norm_var": 0.14073893229166667, "learning_rate": 0.0001, "loss": 8.5853, "loss/crossentropy": 2.1680904626846313, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.29489417374134064, "step": 930 }, { "epoch": 0.05825, "grad_norm": 3.65625, "grad_norm_var": 0.116650390625, "learning_rate": 0.0001, "loss": 8.7008, "loss/crossentropy": 2.139521837234497, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.263563334941864, "step": 932 }, { "epoch": 0.058375, "grad_norm": 3.265625, "grad_norm_var": 0.13267822265625, "learning_rate": 0.0001, "loss": 8.8043, "loss/crossentropy": 2.522794246673584, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3044002056121826, "step": 934 }, { "epoch": 0.0585, "grad_norm": 3.765625, "grad_norm_var": 0.12968343098958332, "learning_rate": 0.0001, "loss": 8.9035, "loss/crossentropy": 2.4657033681869507, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30104976892471313, "step": 936 }, { "epoch": 0.058625, "grad_norm": 5.53125, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0001, "loss": 8.9486, "loss/crossentropy": 2.3463146686553955, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.303235799074173, "step": 938 }, { "epoch": 0.05875, "grad_norm": 3.515625, "grad_norm_var": 0.3243479410807292, "learning_rate": 0.0001, "loss": 8.9808, "loss/crossentropy": 2.2249823808670044, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.29463791847229004, "step": 940 }, { "epoch": 0.058875, "grad_norm": 3.25, "grad_norm_var": 0.30130106608072915, "learning_rate": 0.0001, "loss": 8.67, "loss/crossentropy": 2.334540367126465, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30720797181129456, "step": 942 }, { "epoch": 0.059, "grad_norm": 3.921875, "grad_norm_var": 0.2997029622395833, "learning_rate": 0.0001, "loss": 8.7021, "loss/crossentropy": 2.498934745788574, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2946365475654602, "step": 944 }, { "epoch": 0.059125, "grad_norm": 3.71875, "grad_norm_var": 0.2964508056640625, "learning_rate": 0.0001, "loss": 8.7885, "loss/crossentropy": 2.0534738898277283, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.27286672592163086, "step": 946 }, { "epoch": 0.05925, "grad_norm": 3.125, "grad_norm_var": 0.3176066080729167, "learning_rate": 0.0001, "loss": 8.8829, "loss/crossentropy": 2.362457275390625, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2855580151081085, "step": 948 }, { "epoch": 0.059375, "grad_norm": 3.6875, "grad_norm_var": 0.30732320149739584, "learning_rate": 0.0001, "loss": 8.7018, "loss/crossentropy": 2.1891767382621765, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.31324711441993713, "step": 950 }, { "epoch": 0.0595, "grad_norm": 3.734375, "grad_norm_var": 0.31344401041666664, "learning_rate": 0.0001, "loss": 8.8816, "loss/crossentropy": 2.3532931804656982, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30756065249443054, "step": 952 }, { "epoch": 0.059625, "grad_norm": 3.265625, "grad_norm_var": 0.06813863118489584, "learning_rate": 0.0001, "loss": 8.7884, "loss/crossentropy": 2.365698456764221, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3289715647697449, "step": 954 }, { "epoch": 0.05975, "grad_norm": 3.671875, "grad_norm_var": 0.0709381103515625, "learning_rate": 0.0001, "loss": 8.9138, "loss/crossentropy": 2.614629626274109, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30402082204818726, "step": 956 }, { "epoch": 0.059875, "grad_norm": 3.59375, "grad_norm_var": 0.07691650390625, "learning_rate": 0.0001, "loss": 8.663, "loss/crossentropy": 2.109455645084381, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2772120535373688, "step": 958 }, { "epoch": 0.06, "grad_norm": 3.765625, "grad_norm_var": 0.0668121337890625, "learning_rate": 0.0001, "loss": 8.8741, "loss/crossentropy": 2.380413055419922, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.3016493618488312, "step": 960 }, { "epoch": 0.060125, "grad_norm": 3.5, "grad_norm_var": 0.06373291015625, "learning_rate": 0.0001, "loss": 8.9362, "loss/crossentropy": 2.506435751914978, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.335014671087265, "step": 962 }, { "epoch": 0.06025, "grad_norm": 3.40625, "grad_norm_var": 0.0563140869140625, "learning_rate": 0.0001, "loss": 8.6434, "loss/crossentropy": 1.9750906229019165, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2565095126628876, "step": 964 }, { "epoch": 0.060375, "grad_norm": 3.75, "grad_norm_var": 0.05491434733072917, "learning_rate": 0.0001, "loss": 8.7565, "loss/crossentropy": 2.2668718099594116, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2805168777704239, "step": 966 }, { "epoch": 0.0605, "grad_norm": 3.21875, "grad_norm_var": 0.044266764322916666, "learning_rate": 0.0001, "loss": 8.718, "loss/crossentropy": 2.2082256078720093, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2930946350097656, "step": 968 }, { "epoch": 0.060625, "grad_norm": 3.84375, "grad_norm_var": 0.0510894775390625, "learning_rate": 0.0001, "loss": 8.8922, "loss/crossentropy": 2.4699355363845825, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31211861968040466, "step": 970 }, { "epoch": 0.06075, "grad_norm": 3.609375, "grad_norm_var": 0.07473856608072917, "learning_rate": 0.0001, "loss": 8.6174, "loss/crossentropy": 2.160146117210388, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28759919106960297, "step": 972 }, { "epoch": 0.060875, "grad_norm": 3.09375, "grad_norm_var": 0.07998758951822917, "learning_rate": 0.0001, "loss": 8.9576, "loss/crossentropy": 2.245842933654785, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28311386704444885, "step": 974 }, { "epoch": 0.061, "grad_norm": 3.65625, "grad_norm_var": 0.07842508951822917, "learning_rate": 0.0001, "loss": 8.7692, "loss/crossentropy": 2.3998042345046997, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29600852727890015, "step": 976 }, { "epoch": 0.061125, "grad_norm": 3.53125, "grad_norm_var": 0.07856343587239584, "learning_rate": 0.0001, "loss": 8.7165, "loss/crossentropy": 2.4230899810791016, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2921423017978668, "step": 978 }, { "epoch": 0.06125, "grad_norm": 3.203125, "grad_norm_var": 0.08714090983072917, "learning_rate": 0.0001, "loss": 8.5009, "loss/crossentropy": 2.105591118335724, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.2892334759235382, "step": 980 }, { "epoch": 0.061375, "grad_norm": 3.671875, "grad_norm_var": 0.08603515625, "learning_rate": 0.0001, "loss": 8.8291, "loss/crossentropy": 2.342753052711487, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2922486811876297, "step": 982 }, { "epoch": 0.0615, "grad_norm": 3.046875, "grad_norm_var": 0.09026285807291666, "learning_rate": 0.0001, "loss": 8.7629, "loss/crossentropy": 2.5545257329940796, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29155534505844116, "step": 984 }, { "epoch": 0.061625, "grad_norm": 3.53125, "grad_norm_var": 0.08010152180989584, "learning_rate": 0.0001, "loss": 8.635, "loss/crossentropy": 2.390330672264099, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28914259374141693, "step": 986 }, { "epoch": 0.06175, "grad_norm": 3.109375, "grad_norm_var": 0.039484659830729164, "learning_rate": 0.0001, "loss": 8.6355, "loss/crossentropy": 2.031624495983124, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29533734917640686, "step": 988 }, { "epoch": 0.061875, "grad_norm": 3.625, "grad_norm_var": 0.04781494140625, "learning_rate": 0.0001, "loss": 8.8029, "loss/crossentropy": 2.1612058877944946, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2922551929950714, "step": 990 }, { "epoch": 0.062, "grad_norm": 3.15625, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 8.8484, "loss/crossentropy": 2.2686573266983032, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28837865591049194, "step": 992 }, { "epoch": 0.062125, "grad_norm": 3.953125, "grad_norm_var": 0.08382161458333333, "learning_rate": 0.0001, "loss": 8.9572, "loss/crossentropy": 2.4839664697647095, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.30035223066806793, "step": 994 }, { "epoch": 0.06225, "grad_norm": 3.328125, "grad_norm_var": 0.0792388916015625, "learning_rate": 0.0001, "loss": 8.7334, "loss/crossentropy": 2.434647560119629, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30409903824329376, "step": 996 }, { "epoch": 0.062375, "grad_norm": 4.09375, "grad_norm_var": 0.10009663899739583, "learning_rate": 0.0001, "loss": 8.7896, "loss/crossentropy": 2.349792718887329, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.273041769862175, "step": 998 }, { "epoch": 0.0625, "grad_norm": 2.875, "grad_norm_var": 0.1135162353515625, "learning_rate": 0.0001, "loss": 8.7517, "loss/crossentropy": 2.424883484840393, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.28756730258464813, "step": 1000 }, { "epoch": 0.062625, "grad_norm": 3.1875, "grad_norm_var": 0.1218414306640625, "learning_rate": 0.0001, "loss": 8.5904, "loss/crossentropy": 2.4634835720062256, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28353893756866455, "step": 1002 }, { "epoch": 0.06275, "grad_norm": 3.421875, "grad_norm_var": 0.10761311848958334, "learning_rate": 0.0001, "loss": 8.6432, "loss/crossentropy": 2.3544020652770996, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.291437104344368, "step": 1004 }, { "epoch": 0.062875, "grad_norm": 3.203125, "grad_norm_var": 0.11253255208333333, "learning_rate": 0.0001, "loss": 8.6969, "loss/crossentropy": 2.3183122873306274, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.291623592376709, "step": 1006 }, { "epoch": 0.063, "grad_norm": 3.078125, "grad_norm_var": 0.10125325520833334, "learning_rate": 0.0001, "loss": 8.7425, "loss/crossentropy": 2.5724557638168335, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3042745739221573, "step": 1008 }, { "epoch": 0.063125, "grad_norm": 3.484375, "grad_norm_var": 0.08701171875, "learning_rate": 0.0001, "loss": 8.7429, "loss/crossentropy": 2.393343210220337, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.28968481719493866, "step": 1010 }, { "epoch": 0.06325, "grad_norm": 3.359375, "grad_norm_var": 0.0902740478515625, "learning_rate": 0.0001, "loss": 8.7322, "loss/crossentropy": 2.4881935119628906, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3185275048017502, "step": 1012 }, { "epoch": 0.063375, "grad_norm": 3.3125, "grad_norm_var": 0.05918680826822917, "learning_rate": 0.0001, "loss": 8.6637, "loss/crossentropy": 2.1747263073921204, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27498696744441986, "step": 1014 }, { "epoch": 0.0635, "grad_norm": 3.734375, "grad_norm_var": 0.09000651041666667, "learning_rate": 0.0001, "loss": 8.6898, "loss/crossentropy": 2.5582990646362305, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3123309761285782, "step": 1016 }, { "epoch": 0.063625, "grad_norm": 2.96875, "grad_norm_var": 0.10186258951822917, "learning_rate": 0.0001, "loss": 8.4983, "loss/crossentropy": 2.2065166234970093, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2910768985748291, "step": 1018 }, { "epoch": 0.06375, "grad_norm": 3.171875, "grad_norm_var": 0.10444234212239584, "learning_rate": 0.0001, "loss": 8.5609, "loss/crossentropy": 2.3102041482925415, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2962404191493988, "step": 1020 }, { "epoch": 0.063875, "grad_norm": 3.171875, "grad_norm_var": 0.10598042805989584, "learning_rate": 0.0001, "loss": 8.619, "loss/crossentropy": 2.330891489982605, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.301542192697525, "step": 1022 }, { "epoch": 0.064, "grad_norm": 3.390625, "grad_norm_var": 0.10001627604166667, "learning_rate": 0.0001, "loss": 8.4584, "loss/crossentropy": 2.4117361307144165, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2964586764574051, "step": 1024 }, { "epoch": 0.064125, "grad_norm": 3.28125, "grad_norm_var": 0.0966796875, "learning_rate": 0.0001, "loss": 8.5442, "loss/crossentropy": 2.069741904735565, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2646178603172302, "step": 1026 }, { "epoch": 0.06425, "grad_norm": 3.359375, "grad_norm_var": 0.09484049479166666, "learning_rate": 0.0001, "loss": 8.7027, "loss/crossentropy": 2.4950772523880005, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30084407329559326, "step": 1028 }, { "epoch": 0.064375, "grad_norm": 3.46875, "grad_norm_var": 0.096923828125, "learning_rate": 0.0001, "loss": 8.6711, "loss/crossentropy": 2.3811144828796387, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2816159278154373, "step": 1030 }, { "epoch": 0.0645, "grad_norm": 3.265625, "grad_norm_var": 0.03141988118489583, "learning_rate": 0.0001, "loss": 8.5786, "loss/crossentropy": 2.146743893623352, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2715151458978653, "step": 1032 }, { "epoch": 0.064625, "grad_norm": 3.53125, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 8.6314, "loss/crossentropy": 2.453763008117676, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.28644131124019623, "step": 1034 }, { "epoch": 0.06475, "grad_norm": 3.375, "grad_norm_var": 0.024583943684895835, "learning_rate": 0.0001, "loss": 8.4581, "loss/crossentropy": 2.189074158668518, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28300249576568604, "step": 1036 }, { "epoch": 0.064875, "grad_norm": 3.0625, "grad_norm_var": 0.038102213541666666, "learning_rate": 0.0001, "loss": 8.5097, "loss/crossentropy": 2.2738513946533203, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.27797406911849976, "step": 1038 }, { "epoch": 0.065, "grad_norm": 3.3125, "grad_norm_var": 0.0379547119140625, "learning_rate": 0.0001, "loss": 8.7724, "loss/crossentropy": 2.5585055351257324, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3508901298046112, "step": 1040 }, { "epoch": 0.065125, "grad_norm": 3.640625, "grad_norm_var": 0.04047749837239583, "learning_rate": 0.0001, "loss": 8.8154, "loss/crossentropy": 2.3115618228912354, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2781260311603546, "step": 1042 }, { "epoch": 0.06525, "grad_norm": 3.25, "grad_norm_var": 0.0395660400390625, "learning_rate": 0.0001, "loss": 8.5646, "loss/crossentropy": 2.2015340328216553, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29191769659519196, "step": 1044 }, { "epoch": 0.065375, "grad_norm": 3.640625, "grad_norm_var": 0.04152730305989583, "learning_rate": 0.0001, "loss": 8.7064, "loss/crossentropy": 2.4439064264297485, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31912754476070404, "step": 1046 }, { "epoch": 0.0655, "grad_norm": 3.390625, "grad_norm_var": 0.0404937744140625, "learning_rate": 0.0001, "loss": 8.7495, "loss/crossentropy": 2.611847996711731, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31853775680065155, "step": 1048 }, { "epoch": 0.065625, "grad_norm": 3.75, "grad_norm_var": 0.04395243326822917, "learning_rate": 0.0001, "loss": 8.5285, "loss/crossentropy": 2.058500051498413, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2707058787345886, "step": 1050 }, { "epoch": 0.06575, "grad_norm": 3.453125, "grad_norm_var": 0.043355305989583336, "learning_rate": 0.0001, "loss": 8.6134, "loss/crossentropy": 2.3460679054260254, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29806579649448395, "step": 1052 }, { "epoch": 0.065875, "grad_norm": 3.359375, "grad_norm_var": 0.028531901041666665, "learning_rate": 0.0001, "loss": 8.7906, "loss/crossentropy": 2.575559377670288, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29479941725730896, "step": 1054 }, { "epoch": 0.066, "grad_norm": 3.5, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 8.573, "loss/crossentropy": 2.4485961198806763, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3214751183986664, "step": 1056 }, { "epoch": 0.066125, "grad_norm": 3.28125, "grad_norm_var": 0.0400390625, "learning_rate": 0.0001, "loss": 8.5355, "loss/crossentropy": 2.397768259048462, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.31295061111450195, "step": 1058 }, { "epoch": 0.06625, "grad_norm": 3.21875, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 8.4975, "loss/crossentropy": 2.390447735786438, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29537099599838257, "step": 1060 }, { "epoch": 0.066375, "grad_norm": 3.046875, "grad_norm_var": 0.04865620930989583, "learning_rate": 0.0001, "loss": 8.553, "loss/crossentropy": 2.417343854904175, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28230586647987366, "step": 1062 }, { "epoch": 0.0665, "grad_norm": 3.390625, "grad_norm_var": 0.046507771809895834, "learning_rate": 0.0001, "loss": 8.5829, "loss/crossentropy": 2.6324515342712402, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.3016352653503418, "step": 1064 }, { "epoch": 0.066625, "grad_norm": 3.453125, "grad_norm_var": 0.03913472493489583, "learning_rate": 0.0001, "loss": 8.5346, "loss/crossentropy": 2.323632597923279, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2779388278722763, "step": 1066 }, { "epoch": 0.06675, "grad_norm": 2.953125, "grad_norm_var": 0.03640950520833333, "learning_rate": 0.0001, "loss": 8.3832, "loss/crossentropy": 2.393522083759308, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.29860424995422363, "step": 1068 }, { "epoch": 0.066875, "grad_norm": 3.4375, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 8.6174, "loss/crossentropy": 2.263484001159668, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2802084982395172, "step": 1070 }, { "epoch": 0.067, "grad_norm": 3.296875, "grad_norm_var": 0.03242085774739583, "learning_rate": 0.0001, "loss": 8.4765, "loss/crossentropy": 2.1152660846710205, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2554662525653839, "step": 1072 }, { "epoch": 0.067125, "grad_norm": 3.171875, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 8.5434, "loss/crossentropy": 2.4824490547180176, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28639310598373413, "step": 1074 }, { "epoch": 0.06725, "grad_norm": 3.0, "grad_norm_var": 0.033014933268229164, "learning_rate": 0.0001, "loss": 8.6253, "loss/crossentropy": 2.5830127000808716, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3099432587623596, "step": 1076 }, { "epoch": 0.067375, "grad_norm": 3.96875, "grad_norm_var": 0.0666168212890625, "learning_rate": 0.0001, "loss": 8.7433, "loss/crossentropy": 2.580743908882141, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2936056852340698, "step": 1078 }, { "epoch": 0.0675, "grad_norm": 3.21875, "grad_norm_var": 0.07141011555989583, "learning_rate": 0.0001, "loss": 8.6721, "loss/crossentropy": 2.350903868675232, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2968801259994507, "step": 1080 }, { "epoch": 0.067625, "grad_norm": 3.5, "grad_norm_var": 0.07302144368489584, "learning_rate": 0.0001, "loss": 8.4574, "loss/crossentropy": 2.257428526878357, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2766515016555786, "step": 1082 }, { "epoch": 0.06775, "grad_norm": 3.265625, "grad_norm_var": 0.06972249348958333, "learning_rate": 0.0001, "loss": 8.6877, "loss/crossentropy": 2.2526296377182007, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28292417526245117, "step": 1084 }, { "epoch": 0.067875, "grad_norm": 3.5, "grad_norm_var": 0.06391499837239584, "learning_rate": 0.0001, "loss": 8.7502, "loss/crossentropy": 2.3044220209121704, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2810298800468445, "step": 1086 }, { "epoch": 0.068, "grad_norm": 3.125, "grad_norm_var": 0.06946614583333334, "learning_rate": 0.0001, "loss": 8.585, "loss/crossentropy": 2.2871402502059937, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27473995089530945, "step": 1088 }, { "epoch": 0.068125, "grad_norm": 3.890625, "grad_norm_var": 0.08155008951822916, "learning_rate": 0.0001, "loss": 8.7181, "loss/crossentropy": 2.49469530582428, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2961750328540802, "step": 1090 }, { "epoch": 0.06825, "grad_norm": 3.15625, "grad_norm_var": 0.07158915201822917, "learning_rate": 0.0001, "loss": 8.6174, "loss/crossentropy": 1.980285882949829, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.27205583453178406, "step": 1092 }, { "epoch": 0.068375, "grad_norm": 3.65625, "grad_norm_var": 0.06383056640625, "learning_rate": 0.0001, "loss": 8.383, "loss/crossentropy": 2.4063356518745422, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2908872812986374, "step": 1094 }, { "epoch": 0.0685, "grad_norm": 3.34375, "grad_norm_var": 0.05968424479166667, "learning_rate": 0.0001, "loss": 8.6962, "loss/crossentropy": 2.3776599168777466, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2903301566839218, "step": 1096 }, { "epoch": 0.068625, "grad_norm": 3.515625, "grad_norm_var": 0.05542704264322917, "learning_rate": 0.0001, "loss": 8.4252, "loss/crossentropy": 2.25793194770813, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2622709423303604, "step": 1098 }, { "epoch": 0.06875, "grad_norm": 3.0625, "grad_norm_var": 0.068017578125, "learning_rate": 0.0001, "loss": 8.4571, "loss/crossentropy": 2.0190887451171875, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.25865359604358673, "step": 1100 }, { "epoch": 0.068875, "grad_norm": 3.65625, "grad_norm_var": 0.15084228515625, "learning_rate": 0.0001, "loss": 8.7145, "loss/crossentropy": 2.419832944869995, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.36829979717731476, "step": 1102 }, { "epoch": 0.069, "grad_norm": 3.203125, "grad_norm_var": 0.18118489583333333, "learning_rate": 0.0001, "loss": 8.8892, "loss/crossentropy": 2.3291234970092773, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.27904945611953735, "step": 1104 }, { "epoch": 0.069125, "grad_norm": 3.75, "grad_norm_var": 0.18000386555989584, "learning_rate": 0.0001, "loss": 8.7385, "loss/crossentropy": 2.3887641429901123, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3103363811969757, "step": 1106 }, { "epoch": 0.06925, "grad_norm": 3.171875, "grad_norm_var": 0.1803863525390625, "learning_rate": 0.0001, "loss": 8.4626, "loss/crossentropy": 2.2511096000671387, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2744671106338501, "step": 1108 }, { "epoch": 0.069375, "grad_norm": 3.453125, "grad_norm_var": 0.16078999837239583, "learning_rate": 0.0001, "loss": 8.6296, "loss/crossentropy": 2.251457929611206, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2970822751522064, "step": 1110 }, { "epoch": 0.0695, "grad_norm": 3.171875, "grad_norm_var": 0.18325907389322918, "learning_rate": 0.0001, "loss": 8.4855, "loss/crossentropy": 2.3782349824905396, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.30054476857185364, "step": 1112 }, { "epoch": 0.069625, "grad_norm": 3.375, "grad_norm_var": 0.18281962076822916, "learning_rate": 0.0001, "loss": 8.6186, "loss/crossentropy": 2.605000376701355, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.298698827624321, "step": 1114 }, { "epoch": 0.06975, "grad_norm": 3.34375, "grad_norm_var": 0.15660807291666667, "learning_rate": 0.0001, "loss": 8.5642, "loss/crossentropy": 2.2239125967025757, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.284654900431633, "step": 1116 }, { "epoch": 0.069875, "grad_norm": 3.3125, "grad_norm_var": 0.09190165201822917, "learning_rate": 0.0001, "loss": 8.6873, "loss/crossentropy": 2.3841261863708496, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2972448319196701, "step": 1118 }, { "epoch": 0.07, "grad_norm": 3.53125, "grad_norm_var": 0.04534098307291667, "learning_rate": 0.0001, "loss": 8.4881, "loss/crossentropy": 2.251163959503174, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2940330058336258, "step": 1120 }, { "epoch": 0.070125, "grad_norm": 3.640625, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 8.6624, "loss/crossentropy": 2.390018582344055, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31052152812480927, "step": 1122 }, { "epoch": 0.07025, "grad_norm": 2.875, "grad_norm_var": 0.06489156087239584, "learning_rate": 0.0001, "loss": 8.5129, "loss/crossentropy": 2.2108170986175537, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2933100759983063, "step": 1124 }, { "epoch": 0.070375, "grad_norm": 3.25, "grad_norm_var": 0.06334635416666666, "learning_rate": 0.0001, "loss": 8.4376, "loss/crossentropy": 2.2771471738815308, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28437741100788116, "step": 1126 }, { "epoch": 0.0705, "grad_norm": 3.453125, "grad_norm_var": 0.05369364420572917, "learning_rate": 0.0001, "loss": 8.4285, "loss/crossentropy": 2.2954673767089844, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2847554385662079, "step": 1128 }, { "epoch": 0.070625, "grad_norm": 3.3125, "grad_norm_var": 0.0545318603515625, "learning_rate": 0.0001, "loss": 8.4656, "loss/crossentropy": 2.271798253059387, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2639819011092186, "step": 1130 }, { "epoch": 0.07075, "grad_norm": 3.78125, "grad_norm_var": 0.0674957275390625, "learning_rate": 0.0001, "loss": 8.6624, "loss/crossentropy": 2.5219074487686157, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30493326485157013, "step": 1132 }, { "epoch": 0.070875, "grad_norm": 2.75, "grad_norm_var": 0.09687093098958334, "learning_rate": 0.0001, "loss": 8.3251, "loss/crossentropy": 2.419742465019226, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29505568742752075, "step": 1134 }, { "epoch": 0.071, "grad_norm": 3.734375, "grad_norm_var": 0.1082916259765625, "learning_rate": 0.0001, "loss": 8.6851, "loss/crossentropy": 2.460718870162964, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2991577684879303, "step": 1136 }, { "epoch": 0.071125, "grad_norm": 3.140625, "grad_norm_var": 0.08157145182291667, "learning_rate": 0.0001, "loss": 8.5178, "loss/crossentropy": 2.173828959465027, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.27509088814258575, "step": 1138 }, { "epoch": 0.07125, "grad_norm": 3.109375, "grad_norm_var": 0.07099202473958334, "learning_rate": 0.0001, "loss": 8.2813, "loss/crossentropy": 2.0987906455993652, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26560700684785843, "step": 1140 }, { "epoch": 0.071375, "grad_norm": 3.15625, "grad_norm_var": 0.0721588134765625, "learning_rate": 0.0001, "loss": 8.5089, "loss/crossentropy": 2.3777287006378174, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.27554096281528473, "step": 1142 }, { "epoch": 0.0715, "grad_norm": 3.390625, "grad_norm_var": 0.07099507649739584, "learning_rate": 0.0001, "loss": 8.3986, "loss/crossentropy": 2.294643998146057, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28792035579681396, "step": 1144 }, { "epoch": 0.071625, "grad_norm": 3.3125, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 8.5452, "loss/crossentropy": 2.3531687259674072, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2780257761478424, "step": 1146 }, { "epoch": 0.07175, "grad_norm": 3.15625, "grad_norm_var": 0.06738993326822916, "learning_rate": 0.0001, "loss": 8.5518, "loss/crossentropy": 2.0771710872650146, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26807837188243866, "step": 1148 }, { "epoch": 0.071875, "grad_norm": 3.421875, "grad_norm_var": 0.04797770182291667, "learning_rate": 0.0001, "loss": 8.5416, "loss/crossentropy": 2.18049418926239, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.2739409804344177, "step": 1150 }, { "epoch": 0.072, "grad_norm": 3.09375, "grad_norm_var": 0.03434956868489583, "learning_rate": 0.0001, "loss": 8.4693, "loss/crossentropy": 2.666857123374939, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30170081555843353, "step": 1152 }, { "epoch": 0.072125, "grad_norm": 3.171875, "grad_norm_var": 0.06629130045572916, "learning_rate": 0.0001, "loss": 8.6786, "loss/crossentropy": 2.5841041803359985, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.30311837792396545, "step": 1154 }, { "epoch": 0.07225, "grad_norm": 3.171875, "grad_norm_var": 0.0630767822265625, "learning_rate": 0.0001, "loss": 8.5017, "loss/crossentropy": 2.2012165784835815, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2716425508260727, "step": 1156 }, { "epoch": 0.072375, "grad_norm": 3.171875, "grad_norm_var": 0.06253255208333333, "learning_rate": 0.0001, "loss": 8.5665, "loss/crossentropy": 2.2168599367141724, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30834463238716125, "step": 1158 }, { "epoch": 0.0725, "grad_norm": 4.15625, "grad_norm_var": 0.1092193603515625, "learning_rate": 0.0001, "loss": 8.4126, "loss/crossentropy": 2.1044594049453735, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2663833498954773, "step": 1160 }, { "epoch": 0.072625, "grad_norm": 2.828125, "grad_norm_var": 0.11414388020833334, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.109754800796509, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.27043384313583374, "step": 1162 }, { "epoch": 0.07275, "grad_norm": 3.53125, "grad_norm_var": 0.1210845947265625, "learning_rate": 0.0001, "loss": 8.4745, "loss/crossentropy": 2.384516477584839, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27273619174957275, "step": 1164 }, { "epoch": 0.072875, "grad_norm": 2.984375, "grad_norm_var": 0.1336090087890625, "learning_rate": 0.0001, "loss": 8.4642, "loss/crossentropy": 2.214682459831238, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.27829450368881226, "step": 1166 }, { "epoch": 0.073, "grad_norm": 3.4375, "grad_norm_var": 0.12939046223958334, "learning_rate": 0.0001, "loss": 8.5784, "loss/crossentropy": 2.276426315307617, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29027001559734344, "step": 1168 }, { "epoch": 0.073125, "grad_norm": 3.09375, "grad_norm_var": 0.109130859375, "learning_rate": 0.0001, "loss": 8.353, "loss/crossentropy": 2.1743494272232056, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.26005150377750397, "step": 1170 }, { "epoch": 0.07325, "grad_norm": 3.421875, "grad_norm_var": 0.11611226399739584, "learning_rate": 0.0001, "loss": 8.4193, "loss/crossentropy": 2.279123902320862, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2806738466024399, "step": 1172 }, { "epoch": 0.073375, "grad_norm": 3.390625, "grad_norm_var": 0.11841532389322916, "learning_rate": 0.0001, "loss": 8.5702, "loss/crossentropy": 2.050893008708954, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.253355473279953, "step": 1174 }, { "epoch": 0.0735, "grad_norm": 3.375, "grad_norm_var": 0.07099202473958334, "learning_rate": 0.0001, "loss": 8.7427, "loss/crossentropy": 2.5286508798599243, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31759728491306305, "step": 1176 }, { "epoch": 0.073625, "grad_norm": 3.3125, "grad_norm_var": 0.0608795166015625, "learning_rate": 0.0001, "loss": 8.6737, "loss/crossentropy": 2.445157289505005, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.29573802649974823, "step": 1178 }, { "epoch": 0.07375, "grad_norm": 3.46875, "grad_norm_var": 0.05756734212239583, "learning_rate": 0.0001, "loss": 8.5287, "loss/crossentropy": 2.3890769481658936, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2753005623817444, "step": 1180 }, { "epoch": 0.073875, "grad_norm": 3.375, "grad_norm_var": 0.052099609375, "learning_rate": 0.0001, "loss": 8.5552, "loss/crossentropy": 2.2562392950057983, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.27074071764945984, "step": 1182 }, { "epoch": 0.074, "grad_norm": 3.1875, "grad_norm_var": 0.0451568603515625, "learning_rate": 0.0001, "loss": 8.4914, "loss/crossentropy": 1.960713267326355, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.264203280210495, "step": 1184 }, { "epoch": 0.074125, "grad_norm": 3.265625, "grad_norm_var": 0.036279296875, "learning_rate": 0.0001, "loss": 8.4506, "loss/crossentropy": 2.357021927833557, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28563813865184784, "step": 1186 }, { "epoch": 0.07425, "grad_norm": 3.125, "grad_norm_var": 0.03333333333333333, "learning_rate": 0.0001, "loss": 8.3867, "loss/crossentropy": 2.1780192852020264, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26176655292510986, "step": 1188 }, { "epoch": 0.074375, "grad_norm": 3.328125, "grad_norm_var": 0.034228515625, "learning_rate": 0.0001, "loss": 8.4641, "loss/crossentropy": 2.075110673904419, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.29416830837726593, "step": 1190 }, { "epoch": 0.0745, "grad_norm": 3.09375, "grad_norm_var": 0.0424468994140625, "learning_rate": 0.0001, "loss": 8.3866, "loss/crossentropy": 2.152463436126709, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2740413099527359, "step": 1192 }, { "epoch": 0.074625, "grad_norm": 3.28125, "grad_norm_var": 0.039449055989583336, "learning_rate": 0.0001, "loss": 8.4003, "loss/crossentropy": 2.381898283958435, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.29159918427467346, "step": 1194 }, { "epoch": 0.07475, "grad_norm": 2.84375, "grad_norm_var": 0.04258524576822917, "learning_rate": 0.0001, "loss": 8.5591, "loss/crossentropy": 2.6784266233444214, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30832037329673767, "step": 1196 }, { "epoch": 0.074875, "grad_norm": 3.296875, "grad_norm_var": 0.0288238525390625, "learning_rate": 0.0001, "loss": 8.477, "loss/crossentropy": 2.1786980628967285, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28018996119499207, "step": 1198 }, { "epoch": 0.075, "grad_norm": 3.234375, "grad_norm_var": 0.0292877197265625, "learning_rate": 0.0001, "loss": 8.6585, "loss/crossentropy": 2.248537063598633, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28961239755153656, "step": 1200 }, { "epoch": 0.075125, "grad_norm": 3.140625, "grad_norm_var": 0.029621378580729166, "learning_rate": 0.0001, "loss": 8.4601, "loss/crossentropy": 2.343456268310547, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2769605219364166, "step": 1202 }, { "epoch": 0.07525, "grad_norm": 3.140625, "grad_norm_var": 0.0279449462890625, "learning_rate": 0.0001, "loss": 8.5551, "loss/crossentropy": 2.3451120853424072, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.31704986095428467, "step": 1204 }, { "epoch": 0.075375, "grad_norm": 4.125, "grad_norm_var": 0.07421875, "learning_rate": 0.0001, "loss": 8.2784, "loss/crossentropy": 2.2383298873901367, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2643394321203232, "step": 1206 }, { "epoch": 0.0755, "grad_norm": 3.9375, "grad_norm_var": 0.10383707682291667, "learning_rate": 0.0001, "loss": 8.6391, "loss/crossentropy": 2.0706650018692017, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2589013874530792, "step": 1208 }, { "epoch": 0.075625, "grad_norm": 3.375, "grad_norm_var": 0.10327046712239583, "learning_rate": 0.0001, "loss": 8.6296, "loss/crossentropy": 2.264005422592163, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.28355173766613007, "step": 1210 }, { "epoch": 0.07575, "grad_norm": 3.078125, "grad_norm_var": 0.09348042805989583, "learning_rate": 0.0001, "loss": 8.3618, "loss/crossentropy": 2.4925496578216553, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2993692457675934, "step": 1212 }, { "epoch": 0.075875, "grad_norm": 3.375, "grad_norm_var": 0.09510091145833334, "learning_rate": 0.0001, "loss": 8.5563, "loss/crossentropy": 2.339760661125183, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27024510502815247, "step": 1214 }, { "epoch": 0.076, "grad_norm": 3.1875, "grad_norm_var": 0.10005594889322916, "learning_rate": 0.0001, "loss": 8.5717, "loss/crossentropy": 2.357187867164612, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3173932731151581, "step": 1216 }, { "epoch": 0.076125, "grad_norm": 3.140625, "grad_norm_var": 0.10143229166666666, "learning_rate": 0.0001, "loss": 8.3378, "loss/crossentropy": 2.5177258253097534, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.29622797667980194, "step": 1218 }, { "epoch": 0.07625, "grad_norm": 3.25, "grad_norm_var": 0.098779296875, "learning_rate": 0.0001, "loss": 8.4854, "loss/crossentropy": 2.0992863178253174, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2855496108531952, "step": 1220 }, { "epoch": 0.076375, "grad_norm": 3.03125, "grad_norm_var": 0.0710601806640625, "learning_rate": 0.0001, "loss": 8.3916, "loss/crossentropy": 2.4129068851470947, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.280709832906723, "step": 1222 }, { "epoch": 0.0765, "grad_norm": 3.359375, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 8.3373, "loss/crossentropy": 2.0929447412490845, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.25636833161115646, "step": 1224 }, { "epoch": 0.076625, "grad_norm": 3.0, "grad_norm_var": 0.027326456705729165, "learning_rate": 0.0001, "loss": 8.4648, "loss/crossentropy": 2.427080750465393, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2882840186357498, "step": 1226 }, { "epoch": 0.07675, "grad_norm": 3.140625, "grad_norm_var": 0.026585896809895832, "learning_rate": 0.0001, "loss": 8.3397, "loss/crossentropy": 2.4737452268600464, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28378529846668243, "step": 1228 }, { "epoch": 0.076875, "grad_norm": 3.234375, "grad_norm_var": 0.0234771728515625, "learning_rate": 0.0001, "loss": 8.6966, "loss/crossentropy": 2.6298500299453735, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.306839257478714, "step": 1230 }, { "epoch": 0.077, "grad_norm": 2.859375, "grad_norm_var": 0.03771158854166667, "learning_rate": 0.0001, "loss": 8.4196, "loss/crossentropy": 2.3488998413085938, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.27983449399471283, "step": 1232 }, { "epoch": 0.077125, "grad_norm": 3.828125, "grad_norm_var": 0.06613667805989583, "learning_rate": 0.0001, "loss": 8.5206, "loss/crossentropy": 2.2696053981781006, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2854994237422943, "step": 1234 }, { "epoch": 0.07725, "grad_norm": 2.765625, "grad_norm_var": 0.07351888020833333, "learning_rate": 0.0001, "loss": 8.2717, "loss/crossentropy": 2.174792766571045, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2614743113517761, "step": 1236 }, { "epoch": 0.077375, "grad_norm": 3.921875, "grad_norm_var": 0.09851786295572916, "learning_rate": 0.0001, "loss": 8.6137, "loss/crossentropy": 2.2606674432754517, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28789395093917847, "step": 1238 }, { "epoch": 0.0775, "grad_norm": 3.921875, "grad_norm_var": 0.12114969889322917, "learning_rate": 0.0001, "loss": 8.4582, "loss/crossentropy": 2.104749917984009, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27152082324028015, "step": 1240 }, { "epoch": 0.077625, "grad_norm": 3.28125, "grad_norm_var": 0.1411041259765625, "learning_rate": 0.0001, "loss": 8.3861, "loss/crossentropy": 2.198368549346924, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.26415789127349854, "step": 1242 }, { "epoch": 0.07775, "grad_norm": 3.015625, "grad_norm_var": 0.1497711181640625, "learning_rate": 0.0001, "loss": 8.5154, "loss/crossentropy": 2.4816545248031616, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30805790424346924, "step": 1244 }, { "epoch": 0.077875, "grad_norm": 3.515625, "grad_norm_var": 0.1504547119140625, "learning_rate": 0.0001, "loss": 8.5166, "loss/crossentropy": 2.4640896320343018, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29878415167331696, "step": 1246 }, { "epoch": 0.078, "grad_norm": 2.796875, "grad_norm_var": 0.15390218098958333, "learning_rate": 0.0001, "loss": 8.3599, "loss/crossentropy": 2.2658848762512207, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.264192171394825, "step": 1248 }, { "epoch": 0.078125, "grad_norm": 3.5, "grad_norm_var": 0.142724609375, "learning_rate": 0.0001, "loss": 8.3164, "loss/crossentropy": 2.1921703815460205, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2650914490222931, "step": 1250 }, { "epoch": 0.07825, "grad_norm": 2.921875, "grad_norm_var": 0.1355865478515625, "learning_rate": 0.0001, "loss": 8.5194, "loss/crossentropy": 2.2807745933532715, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28195033967494965, "step": 1252 }, { "epoch": 0.078375, "grad_norm": 3.34375, "grad_norm_var": 0.12167561848958333, "learning_rate": 0.0001, "loss": 8.4772, "loss/crossentropy": 2.145294189453125, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2704995721578598, "step": 1254 }, { "epoch": 0.0785, "grad_norm": 2.859375, "grad_norm_var": 0.1064849853515625, "learning_rate": 0.0001, "loss": 8.2792, "loss/crossentropy": 2.3187735080718994, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28507962822914124, "step": 1256 }, { "epoch": 0.078625, "grad_norm": 3.734375, "grad_norm_var": 0.08872782389322917, "learning_rate": 0.0001, "loss": 8.4454, "loss/crossentropy": 2.226912260055542, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2773582488298416, "step": 1258 }, { "epoch": 0.07875, "grad_norm": 2.921875, "grad_norm_var": 0.08684488932291666, "learning_rate": 0.0001, "loss": 8.4253, "loss/crossentropy": 2.524247169494629, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.28448525071144104, "step": 1260 }, { "epoch": 0.078875, "grad_norm": 3.109375, "grad_norm_var": 0.08153889973958334, "learning_rate": 0.0001, "loss": 8.4428, "loss/crossentropy": 2.434785842895508, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2782677710056305, "step": 1262 }, { "epoch": 0.079, "grad_norm": 3.171875, "grad_norm_var": 0.09170633951822917, "learning_rate": 0.0001, "loss": 8.3836, "loss/crossentropy": 2.0830533504486084, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27287817001342773, "step": 1264 }, { "epoch": 0.079125, "grad_norm": 3.265625, "grad_norm_var": 0.09251200358072917, "learning_rate": 0.0001, "loss": 8.3748, "loss/crossentropy": 2.2747669219970703, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3025789111852646, "step": 1266 }, { "epoch": 0.07925, "grad_norm": 3.140625, "grad_norm_var": 0.07610270182291666, "learning_rate": 0.0001, "loss": 8.4631, "loss/crossentropy": 2.2862067222595215, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2721339762210846, "step": 1268 }, { "epoch": 0.079375, "grad_norm": 3.21875, "grad_norm_var": 0.07183837890625, "learning_rate": 0.0001, "loss": 8.4595, "loss/crossentropy": 2.3111391067504883, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2573637366294861, "step": 1270 }, { "epoch": 0.0795, "grad_norm": 3.0625, "grad_norm_var": 0.06552632649739583, "learning_rate": 0.0001, "loss": 8.3966, "loss/crossentropy": 2.5623552799224854, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.26713909208774567, "step": 1272 }, { "epoch": 0.079625, "grad_norm": 3.15625, "grad_norm_var": 0.04524637858072917, "learning_rate": 0.0001, "loss": 8.2061, "loss/crossentropy": 2.1350300312042236, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.26287516951560974, "step": 1274 }, { "epoch": 0.07975, "grad_norm": 3.046875, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 8.5091, "loss/crossentropy": 2.3671988248825073, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.274411678314209, "step": 1276 }, { "epoch": 0.079875, "grad_norm": 3.109375, "grad_norm_var": 0.034989420572916666, "learning_rate": 0.0001, "loss": 8.4391, "loss/crossentropy": 2.280188202857971, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2855434864759445, "step": 1278 }, { "epoch": 0.08, "grad_norm": 3.34375, "grad_norm_var": 0.013997395833333334, "learning_rate": 0.0001, "loss": 8.561, "loss/crossentropy": 2.09197735786438, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2575538009405136, "step": 1280 }, { "epoch": 0.080125, "grad_norm": 2.859375, "grad_norm_var": 0.0140045166015625, "learning_rate": 0.0001, "loss": 8.374, "loss/crossentropy": 2.199427366256714, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2671803832054138, "step": 1282 }, { "epoch": 0.08025, "grad_norm": 3.15625, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 8.4494, "loss/crossentropy": 2.646793484687805, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26508933305740356, "step": 1284 }, { "epoch": 0.080375, "grad_norm": 3.125, "grad_norm_var": 0.017378743489583334, "learning_rate": 0.0001, "loss": 8.3257, "loss/crossentropy": 2.255902647972107, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26800450682640076, "step": 1286 }, { "epoch": 0.0805, "grad_norm": 3.703125, "grad_norm_var": 0.03821207682291667, "learning_rate": 0.0001, "loss": 8.6049, "loss/crossentropy": 2.4666751623153687, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2575417757034302, "step": 1288 }, { "epoch": 0.080625, "grad_norm": 3.234375, "grad_norm_var": 0.038874308268229164, "learning_rate": 0.0001, "loss": 8.4277, "loss/crossentropy": 2.2299275398254395, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26189981400966644, "step": 1290 }, { "epoch": 0.08075, "grad_norm": 3.234375, "grad_norm_var": 0.0371978759765625, "learning_rate": 0.0001, "loss": 8.3189, "loss/crossentropy": 2.3310989141464233, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26487114280462265, "step": 1292 }, { "epoch": 0.080875, "grad_norm": 3.6875, "grad_norm_var": 0.05325520833333333, "learning_rate": 0.0001, "loss": 8.4445, "loss/crossentropy": 2.2357208728790283, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27044905722141266, "step": 1294 }, { "epoch": 0.081, "grad_norm": 2.828125, "grad_norm_var": 0.0831451416015625, "learning_rate": 0.0001, "loss": 8.2945, "loss/crossentropy": 2.4127997159957886, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2716076225042343, "step": 1296 }, { "epoch": 0.081125, "grad_norm": 3.109375, "grad_norm_var": 0.0810546875, "learning_rate": 0.0001, "loss": 8.1039, "loss/crossentropy": 2.271215081214905, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.26447129994630814, "step": 1298 }, { "epoch": 0.08125, "grad_norm": 2.953125, "grad_norm_var": 0.08394266764322916, "learning_rate": 0.0001, "loss": 8.4898, "loss/crossentropy": 2.214189291000366, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26683974266052246, "step": 1300 }, { "epoch": 0.081375, "grad_norm": 3.03125, "grad_norm_var": 0.09244791666666667, "learning_rate": 0.0001, "loss": 8.4019, "loss/crossentropy": 2.1998738050460815, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2771088480949402, "step": 1302 }, { "epoch": 0.0815, "grad_norm": 3.515625, "grad_norm_var": 0.08205464680989584, "learning_rate": 0.0001, "loss": 8.391, "loss/crossentropy": 2.10916006565094, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26495426893234253, "step": 1304 }, { "epoch": 0.081625, "grad_norm": 3.015625, "grad_norm_var": 0.111962890625, "learning_rate": 0.0001, "loss": 8.4097, "loss/crossentropy": 2.428833842277527, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28503939509391785, "step": 1306 }, { "epoch": 0.08175, "grad_norm": 3.046875, "grad_norm_var": 0.1131988525390625, "learning_rate": 0.0001, "loss": 8.441, "loss/crossentropy": 2.512449622154236, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.33712296187877655, "step": 1308 }, { "epoch": 0.081875, "grad_norm": 3.28125, "grad_norm_var": 0.088330078125, "learning_rate": 0.0001, "loss": 8.3889, "loss/crossentropy": 2.438145875930786, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2805769294500351, "step": 1310 }, { "epoch": 0.082, "grad_norm": 3.03125, "grad_norm_var": 0.0719390869140625, "learning_rate": 0.0001, "loss": 8.2675, "loss/crossentropy": 2.35923433303833, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2907231003046036, "step": 1312 }, { "epoch": 0.082125, "grad_norm": 3.125, "grad_norm_var": 0.07124735514322916, "learning_rate": 0.0001, "loss": 8.3908, "loss/crossentropy": 2.288873791694641, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2815335690975189, "step": 1314 }, { "epoch": 0.08225, "grad_norm": 3.125, "grad_norm_var": 0.0664459228515625, "learning_rate": 0.0001, "loss": 8.4692, "loss/crossentropy": 2.4051743745803833, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.282541960477829, "step": 1316 }, { "epoch": 0.082375, "grad_norm": 3.734375, "grad_norm_var": 0.0790924072265625, "learning_rate": 0.0001, "loss": 8.2673, "loss/crossentropy": 2.115163564682007, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26003655791282654, "step": 1318 }, { "epoch": 0.0825, "grad_norm": 2.984375, "grad_norm_var": 0.07752278645833334, "learning_rate": 0.0001, "loss": 8.3407, "loss/crossentropy": 2.294703960418701, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2557590380311012, "step": 1320 }, { "epoch": 0.082625, "grad_norm": 3.140625, "grad_norm_var": 0.041356404622395836, "learning_rate": 0.0001, "loss": 8.3636, "loss/crossentropy": 2.6248332262039185, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2877664119005203, "step": 1322 }, { "epoch": 0.08275, "grad_norm": 3.234375, "grad_norm_var": 0.04487202962239583, "learning_rate": 0.0001, "loss": 8.392, "loss/crossentropy": 2.212457776069641, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2759707272052765, "step": 1324 }, { "epoch": 0.082875, "grad_norm": 3.015625, "grad_norm_var": 0.0431304931640625, "learning_rate": 0.0001, "loss": 8.3794, "loss/crossentropy": 1.8898176550865173, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28947535157203674, "step": 1326 }, { "epoch": 0.083, "grad_norm": 3.1875, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 8.4226, "loss/crossentropy": 2.3176772594451904, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.30063313245773315, "step": 1328 }, { "epoch": 0.083125, "grad_norm": 3.421875, "grad_norm_var": 0.04169514973958333, "learning_rate": 0.0001, "loss": 8.033, "loss/crossentropy": 1.9844502806663513, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.22963125258684158, "step": 1330 }, { "epoch": 0.08325, "grad_norm": 2.984375, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 8.3364, "loss/crossentropy": 2.0860679745674133, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28032663464546204, "step": 1332 }, { "epoch": 0.083375, "grad_norm": 3.171875, "grad_norm_var": 0.028120930989583334, "learning_rate": 0.0001, "loss": 8.2641, "loss/crossentropy": 2.2230706214904785, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26876039803028107, "step": 1334 }, { "epoch": 0.0835, "grad_norm": 3.28125, "grad_norm_var": 0.026904296875, "learning_rate": 0.0001, "loss": 8.4189, "loss/crossentropy": 2.307973623275757, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2883561700582504, "step": 1336 }, { "epoch": 0.083625, "grad_norm": 3.390625, "grad_norm_var": 0.0290679931640625, "learning_rate": 0.0001, "loss": 8.292, "loss/crossentropy": 2.3308621644973755, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26050567626953125, "step": 1338 }, { "epoch": 0.08375, "grad_norm": 3.40625, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 8.2063, "loss/crossentropy": 2.1553597450256348, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27626167237758636, "step": 1340 }, { "epoch": 0.083875, "grad_norm": 3.09375, "grad_norm_var": 0.08676656087239583, "learning_rate": 0.0001, "loss": 8.5019, "loss/crossentropy": 2.21061909198761, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.27770161628723145, "step": 1342 }, { "epoch": 0.084, "grad_norm": 2.734375, "grad_norm_var": 0.1104400634765625, "learning_rate": 0.0001, "loss": 8.222, "loss/crossentropy": 2.2863982915878296, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2523474544286728, "step": 1344 }, { "epoch": 0.084125, "grad_norm": 3.09375, "grad_norm_var": 0.11259358723958333, "learning_rate": 0.0001, "loss": 8.307, "loss/crossentropy": 2.4110026359558105, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26567649841308594, "step": 1346 }, { "epoch": 0.08425, "grad_norm": 3.265625, "grad_norm_var": 0.10590718587239584, "learning_rate": 0.0001, "loss": 8.3584, "loss/crossentropy": 2.3141287565231323, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2731921225786209, "step": 1348 }, { "epoch": 0.084375, "grad_norm": 3.140625, "grad_norm_var": 0.09840087890625, "learning_rate": 0.0001, "loss": 8.3221, "loss/crossentropy": 2.4939377307891846, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2732496112585068, "step": 1350 }, { "epoch": 0.0845, "grad_norm": 3.09375, "grad_norm_var": 0.09888916015625, "learning_rate": 0.0001, "loss": 8.4167, "loss/crossentropy": 2.6840182542800903, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2915331721305847, "step": 1352 }, { "epoch": 0.084625, "grad_norm": 2.984375, "grad_norm_var": 0.1009674072265625, "learning_rate": 0.0001, "loss": 8.361, "loss/crossentropy": 2.3241279125213623, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.25332190841436386, "step": 1354 }, { "epoch": 0.08475, "grad_norm": 2.96875, "grad_norm_var": 0.09444071451822916, "learning_rate": 0.0001, "loss": 8.128, "loss/crossentropy": 2.2505773305892944, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27977539598941803, "step": 1356 }, { "epoch": 0.084875, "grad_norm": 3.609375, "grad_norm_var": 0.04345296223958333, "learning_rate": 0.0001, "loss": 8.383, "loss/crossentropy": 2.4839106798171997, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2985747307538986, "step": 1358 }, { "epoch": 0.085, "grad_norm": 3.046875, "grad_norm_var": 0.03417561848958333, "learning_rate": 0.0001, "loss": 8.3302, "loss/crossentropy": 2.1147927045822144, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26798177510499954, "step": 1360 }, { "epoch": 0.085125, "grad_norm": 3.03125, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 8.2878, "loss/crossentropy": 2.341870665550232, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.29636865854263306, "step": 1362 }, { "epoch": 0.08525, "grad_norm": 3.296875, "grad_norm_var": 0.033772786458333336, "learning_rate": 0.0001, "loss": 8.3404, "loss/crossentropy": 2.3820383548736572, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.308891698718071, "step": 1364 }, { "epoch": 0.085375, "grad_norm": 2.984375, "grad_norm_var": 0.03592122395833333, "learning_rate": 0.0001, "loss": 8.2509, "loss/crossentropy": 2.406251907348633, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2710695117712021, "step": 1366 }, { "epoch": 0.0855, "grad_norm": 3.09375, "grad_norm_var": 0.04088541666666667, "learning_rate": 0.0001, "loss": 8.4671, "loss/crossentropy": 2.4824094772338867, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2687048017978668, "step": 1368 }, { "epoch": 0.085625, "grad_norm": 3.140625, "grad_norm_var": 0.0371490478515625, "learning_rate": 0.0001, "loss": 8.2799, "loss/crossentropy": 2.1223180890083313, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25326162576675415, "step": 1370 }, { "epoch": 0.08575, "grad_norm": 3.015625, "grad_norm_var": 0.0335113525390625, "learning_rate": 0.0001, "loss": 8.2298, "loss/crossentropy": 2.3254255056381226, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28770148754119873, "step": 1372 }, { "epoch": 0.085875, "grad_norm": 3.0, "grad_norm_var": 0.018928019205729167, "learning_rate": 0.0001, "loss": 8.405, "loss/crossentropy": 2.369110345840454, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2755488455295563, "step": 1374 }, { "epoch": 0.086, "grad_norm": 2.90625, "grad_norm_var": 0.018919881184895834, "learning_rate": 0.0001, "loss": 8.1925, "loss/crossentropy": 2.5451020002365112, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27585017681121826, "step": 1376 }, { "epoch": 0.086125, "grad_norm": 3.109375, "grad_norm_var": 0.018876139322916666, "learning_rate": 0.0001, "loss": 8.5378, "loss/crossentropy": 2.5744348764419556, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28956979513168335, "step": 1378 }, { "epoch": 0.08625, "grad_norm": 3.046875, "grad_norm_var": 0.014729817708333334, "learning_rate": 0.0001, "loss": 8.3531, "loss/crossentropy": 2.0667566061019897, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2600719928741455, "step": 1380 }, { "epoch": 0.086375, "grad_norm": 3.1875, "grad_norm_var": 0.017464192708333333, "learning_rate": 0.0001, "loss": 8.3839, "loss/crossentropy": 2.383557438850403, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.279419407248497, "step": 1382 }, { "epoch": 0.0865, "grad_norm": 3.125, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 8.3527, "loss/crossentropy": 2.262708902359009, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26019924879074097, "step": 1384 }, { "epoch": 0.086625, "grad_norm": 3.28125, "grad_norm_var": 0.01695556640625, "learning_rate": 0.0001, "loss": 8.4089, "loss/crossentropy": 2.422199249267578, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.257827490568161, "step": 1386 }, { "epoch": 0.08675, "grad_norm": 3.09375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 8.2639, "loss/crossentropy": 2.341743230819702, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2593771815299988, "step": 1388 }, { "epoch": 0.086875, "grad_norm": 3.828125, "grad_norm_var": 0.042578125, "learning_rate": 0.0001, "loss": 8.2408, "loss/crossentropy": 2.2627909183502197, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.262659452855587, "step": 1390 }, { "epoch": 0.087, "grad_norm": 2.84375, "grad_norm_var": 0.05591532389322917, "learning_rate": 0.0001, "loss": 8.202, "loss/crossentropy": 2.2393475770950317, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2788470536470413, "step": 1392 }, { "epoch": 0.087125, "grad_norm": 3.4375, "grad_norm_var": 0.06106363932291667, "learning_rate": 0.0001, "loss": 8.468, "loss/crossentropy": 2.256345748901367, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2531740814447403, "step": 1394 }, { "epoch": 0.08725, "grad_norm": 3.359375, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 8.3001, "loss/crossentropy": 2.263739228248596, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26853087544441223, "step": 1396 }, { "epoch": 0.087375, "grad_norm": 3.125, "grad_norm_var": 0.067919921875, "learning_rate": 0.0001, "loss": 8.2254, "loss/crossentropy": 2.3907299041748047, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26690760254859924, "step": 1398 }, { "epoch": 0.0875, "grad_norm": 3.15625, "grad_norm_var": 0.06586812337239584, "learning_rate": 0.0001, "loss": 8.4731, "loss/crossentropy": 2.3437803983688354, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.259974405169487, "step": 1400 }, { "epoch": 0.087625, "grad_norm": 2.90625, "grad_norm_var": 0.06672261555989584, "learning_rate": 0.0001, "loss": 8.4484, "loss/crossentropy": 2.3048956394195557, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2741893529891968, "step": 1402 }, { "epoch": 0.08775, "grad_norm": 3.15625, "grad_norm_var": 0.06726888020833334, "learning_rate": 0.0001, "loss": 8.3698, "loss/crossentropy": 2.3523448705673218, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.280557781457901, "step": 1404 }, { "epoch": 0.087875, "grad_norm": 3.25, "grad_norm_var": 0.033524576822916666, "learning_rate": 0.0001, "loss": 8.3848, "loss/crossentropy": 2.3498148918151855, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3289744406938553, "step": 1406 }, { "epoch": 0.088, "grad_norm": 3.9375, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 8.2261, "loss/crossentropy": 2.071715295314789, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26693500578403473, "step": 1408 }, { "epoch": 0.088125, "grad_norm": 3.421875, "grad_norm_var": 0.0634429931640625, "learning_rate": 0.0001, "loss": 8.2024, "loss/crossentropy": 2.333922863006592, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27316156029701233, "step": 1410 }, { "epoch": 0.08825, "grad_norm": 3.265625, "grad_norm_var": 0.05562744140625, "learning_rate": 0.0001, "loss": 8.0937, "loss/crossentropy": 2.1679932475090027, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.25762278586626053, "step": 1412 }, { "epoch": 0.088375, "grad_norm": 2.96875, "grad_norm_var": 0.0583648681640625, "learning_rate": 0.0001, "loss": 8.2188, "loss/crossentropy": 2.3344188928604126, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.32225053012371063, "step": 1414 }, { "epoch": 0.0885, "grad_norm": 3.40625, "grad_norm_var": 0.06533915201822917, "learning_rate": 0.0001, "loss": 8.2794, "loss/crossentropy": 2.181140899658203, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.25673504173755646, "step": 1416 }, { "epoch": 0.088625, "grad_norm": 3.015625, "grad_norm_var": 0.0631988525390625, "learning_rate": 0.0001, "loss": 8.4926, "loss/crossentropy": 2.5540969371795654, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.28567659854888916, "step": 1418 }, { "epoch": 0.08875, "grad_norm": 3.0625, "grad_norm_var": 0.13014322916666668, "learning_rate": 0.0001, "loss": 8.3927, "loss/crossentropy": 2.308061122894287, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26119405031204224, "step": 1420 }, { "epoch": 0.088875, "grad_norm": 4.125, "grad_norm_var": 0.22221577962239583, "learning_rate": 0.0001, "loss": 8.3563, "loss/crossentropy": 2.337175130844116, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28924381732940674, "step": 1422 }, { "epoch": 0.089, "grad_norm": 3.0625, "grad_norm_var": 0.205322265625, "learning_rate": 0.0001, "loss": 8.2574, "loss/crossentropy": 2.383028507232666, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2829654812812805, "step": 1424 }, { "epoch": 0.089125, "grad_norm": 2.8125, "grad_norm_var": 0.22934468587239584, "learning_rate": 0.0001, "loss": 8.0822, "loss/crossentropy": 2.2887638807296753, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2677062898874283, "step": 1426 }, { "epoch": 0.08925, "grad_norm": 2.9375, "grad_norm_var": 0.23911031087239584, "learning_rate": 0.0001, "loss": 8.2723, "loss/crossentropy": 2.2472543716430664, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2706802934408188, "step": 1428 }, { "epoch": 0.089375, "grad_norm": 3.3125, "grad_norm_var": 0.23624674479166666, "learning_rate": 0.0001, "loss": 8.378, "loss/crossentropy": 2.517896294593811, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28536301851272583, "step": 1430 }, { "epoch": 0.0895, "grad_norm": 2.984375, "grad_norm_var": 0.23261311848958333, "learning_rate": 0.0001, "loss": 8.1376, "loss/crossentropy": 2.118437886238098, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2512395307421684, "step": 1432 }, { "epoch": 0.089625, "grad_norm": 3.1875, "grad_norm_var": 0.22909749348958333, "learning_rate": 0.0001, "loss": 8.2629, "loss/crossentropy": 2.177670121192932, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2840624302625656, "step": 1434 }, { "epoch": 0.08975, "grad_norm": 3.28125, "grad_norm_var": 0.16071675618489584, "learning_rate": 0.0001, "loss": 8.3297, "loss/crossentropy": 2.422105073928833, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28876344859600067, "step": 1436 }, { "epoch": 0.089875, "grad_norm": 2.65625, "grad_norm_var": 0.03298238118489583, "learning_rate": 0.0001, "loss": 8.3754, "loss/crossentropy": 2.469232678413391, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2923481911420822, "step": 1438 }, { "epoch": 0.09, "grad_norm": 3.03125, "grad_norm_var": 0.0335845947265625, "learning_rate": 0.0001, "loss": 8.2403, "loss/crossentropy": 2.3302817344665527, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26953746378421783, "step": 1440 }, { "epoch": 0.090125, "grad_norm": 3.171875, "grad_norm_var": 0.0303131103515625, "learning_rate": 0.0001, "loss": 8.2342, "loss/crossentropy": 2.1770907640457153, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26305052638053894, "step": 1442 }, { "epoch": 0.09025, "grad_norm": 3.03125, "grad_norm_var": 0.0291412353515625, "learning_rate": 0.0001, "loss": 8.2963, "loss/crossentropy": 2.244715094566345, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2524953857064247, "step": 1444 }, { "epoch": 0.090375, "grad_norm": 3.53125, "grad_norm_var": 0.0806549072265625, "learning_rate": 0.0001, "loss": 8.2455, "loss/crossentropy": 2.1872771978378296, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28185485303401947, "step": 1446 }, { "epoch": 0.0905, "grad_norm": 2.796875, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 8.1763, "loss/crossentropy": 2.388404607772827, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25562550872564316, "step": 1448 }, { "epoch": 0.090625, "grad_norm": 3.34375, "grad_norm_var": 0.11158854166666667, "learning_rate": 0.0001, "loss": 8.2801, "loss/crossentropy": 2.2114795446395874, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2575107663869858, "step": 1450 }, { "epoch": 0.09075, "grad_norm": 2.859375, "grad_norm_var": 0.11750895182291667, "learning_rate": 0.0001, "loss": 8.4195, "loss/crossentropy": 2.268153190612793, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2590959519147873, "step": 1452 }, { "epoch": 0.090875, "grad_norm": 3.0, "grad_norm_var": 0.10434468587239583, "learning_rate": 0.0001, "loss": 8.1186, "loss/crossentropy": 2.3246419429779053, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27099407464265823, "step": 1454 }, { "epoch": 0.091, "grad_norm": 3.046875, "grad_norm_var": 0.10079752604166667, "learning_rate": 0.0001, "loss": 8.3338, "loss/crossentropy": 2.5052762031555176, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29280832409858704, "step": 1456 }, { "epoch": 0.091125, "grad_norm": 3.15625, "grad_norm_var": 0.09954325358072917, "learning_rate": 0.0001, "loss": 8.3336, "loss/crossentropy": 1.9449425339698792, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.275674507021904, "step": 1458 }, { "epoch": 0.09125, "grad_norm": 2.8125, "grad_norm_var": 0.10442301432291666, "learning_rate": 0.0001, "loss": 8.3734, "loss/crossentropy": 2.3953585624694824, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28133782744407654, "step": 1460 }, { "epoch": 0.091375, "grad_norm": 3.015625, "grad_norm_var": 0.0473785400390625, "learning_rate": 0.0001, "loss": 8.0915, "loss/crossentropy": 2.2441056966781616, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2531541734933853, "step": 1462 }, { "epoch": 0.0915, "grad_norm": 3.0, "grad_norm_var": 0.04156494140625, "learning_rate": 0.0001, "loss": 8.4787, "loss/crossentropy": 2.4373109340667725, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2879520505666733, "step": 1464 }, { "epoch": 0.091625, "grad_norm": 2.96875, "grad_norm_var": 0.014850870768229166, "learning_rate": 0.0001, "loss": 8.101, "loss/crossentropy": 2.2889362573623657, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25555726885795593, "step": 1466 }, { "epoch": 0.09175, "grad_norm": 3.171875, "grad_norm_var": 0.018778483072916668, "learning_rate": 0.0001, "loss": 8.1266, "loss/crossentropy": 2.2097198963165283, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27271226048469543, "step": 1468 }, { "epoch": 0.091875, "grad_norm": 3.171875, "grad_norm_var": 0.020963541666666665, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.181049108505249, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26682595908641815, "step": 1470 }, { "epoch": 0.092, "grad_norm": 3.03125, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 8.2866, "loss/crossentropy": 2.311566710472107, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24793966859579086, "step": 1472 }, { "epoch": 0.092125, "grad_norm": 3.203125, "grad_norm_var": 0.022704060872395834, "learning_rate": 0.0001, "loss": 8.178, "loss/crossentropy": 2.359419822692871, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26220113039016724, "step": 1474 }, { "epoch": 0.09225, "grad_norm": 3.21875, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 8.4216, "loss/crossentropy": 2.547469735145569, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2959955930709839, "step": 1476 }, { "epoch": 0.092375, "grad_norm": 3.03125, "grad_norm_var": 0.021100870768229165, "learning_rate": 0.0001, "loss": 8.2998, "loss/crossentropy": 2.4008055925369263, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2753802388906479, "step": 1478 }, { "epoch": 0.0925, "grad_norm": 2.671875, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 8.0442, "loss/crossentropy": 2.049844443798065, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.25805267691612244, "step": 1480 }, { "epoch": 0.092625, "grad_norm": 3.078125, "grad_norm_var": 0.029100545247395835, "learning_rate": 0.0001, "loss": 8.3892, "loss/crossentropy": 2.3216545581817627, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.27284783124923706, "step": 1482 }, { "epoch": 0.09275, "grad_norm": 2.859375, "grad_norm_var": 0.0282867431640625, "learning_rate": 0.0001, "loss": 8.2379, "loss/crossentropy": 2.3917791843414307, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26631103456020355, "step": 1484 }, { "epoch": 0.092875, "grad_norm": 3.21875, "grad_norm_var": 0.028125, "learning_rate": 0.0001, "loss": 8.5063, "loss/crossentropy": 2.536360025405884, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2958754599094391, "step": 1486 }, { "epoch": 0.093, "grad_norm": 2.984375, "grad_norm_var": 0.0410308837890625, "learning_rate": 0.0001, "loss": 8.3401, "loss/crossentropy": 2.387327551841736, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28280311822891235, "step": 1488 }, { "epoch": 0.093125, "grad_norm": 3.578125, "grad_norm_var": 0.05903218587239583, "learning_rate": 0.0001, "loss": 8.3749, "loss/crossentropy": 2.4892961978912354, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29599107801914215, "step": 1490 }, { "epoch": 0.09325, "grad_norm": 3.21875, "grad_norm_var": 0.0744781494140625, "learning_rate": 0.0001, "loss": 8.2995, "loss/crossentropy": 2.233021378517151, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.269522100687027, "step": 1492 }, { "epoch": 0.093375, "grad_norm": 2.90625, "grad_norm_var": 0.08401285807291667, "learning_rate": 0.0001, "loss": 8.1287, "loss/crossentropy": 2.2778061628341675, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2637213170528412, "step": 1494 }, { "epoch": 0.0935, "grad_norm": 3.21875, "grad_norm_var": 0.08079020182291667, "learning_rate": 0.0001, "loss": 8.5223, "loss/crossentropy": 2.364134907722473, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2790801376104355, "step": 1496 }, { "epoch": 0.093625, "grad_norm": 2.734375, "grad_norm_var": 0.09123942057291666, "learning_rate": 0.0001, "loss": 8.1725, "loss/crossentropy": 2.0337949991226196, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.244078166782856, "step": 1498 }, { "epoch": 0.09375, "grad_norm": 3.03125, "grad_norm_var": 0.08502197265625, "learning_rate": 0.0001, "loss": 8.3314, "loss/crossentropy": 2.307387113571167, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2793863117694855, "step": 1500 }, { "epoch": 0.093875, "grad_norm": 2.84375, "grad_norm_var": 0.09312744140625, "learning_rate": 0.0001, "loss": 7.9943, "loss/crossentropy": 2.4143176078796387, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26304905116558075, "step": 1502 }, { "epoch": 0.094, "grad_norm": 3.296875, "grad_norm_var": 0.08209228515625, "learning_rate": 0.0001, "loss": 8.3215, "loss/crossentropy": 1.9640471935272217, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24391764402389526, "step": 1504 }, { "epoch": 0.094125, "grad_norm": 3.03125, "grad_norm_var": 0.0652252197265625, "learning_rate": 0.0001, "loss": 8.204, "loss/crossentropy": 2.405478358268738, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2646511495113373, "step": 1506 }, { "epoch": 0.09425, "grad_norm": 2.796875, "grad_norm_var": 0.046126302083333334, "learning_rate": 0.0001, "loss": 8.1211, "loss/crossentropy": 2.0469033122062683, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2384241446852684, "step": 1508 }, { "epoch": 0.094375, "grad_norm": 3.1875, "grad_norm_var": 0.0455230712890625, "learning_rate": 0.0001, "loss": 8.1709, "loss/crossentropy": 2.2403076887130737, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2608010023832321, "step": 1510 }, { "epoch": 0.0945, "grad_norm": 3.328125, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 8.1339, "loss/crossentropy": 2.1499756574630737, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.271525114774704, "step": 1512 }, { "epoch": 0.094625, "grad_norm": 2.734375, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 8.2903, "loss/crossentropy": 2.1646158695220947, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2660486549139023, "step": 1514 }, { "epoch": 0.09475, "grad_norm": 2.984375, "grad_norm_var": 0.05085347493489583, "learning_rate": 0.0001, "loss": 7.9533, "loss/crossentropy": 2.1994398832321167, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2655777484178543, "step": 1516 }, { "epoch": 0.094875, "grad_norm": 3.171875, "grad_norm_var": 0.047484334309895834, "learning_rate": 0.0001, "loss": 8.1863, "loss/crossentropy": 2.0542885661125183, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2492573782801628, "step": 1518 }, { "epoch": 0.095, "grad_norm": 2.84375, "grad_norm_var": 0.046507771809895834, "learning_rate": 0.0001, "loss": 8.0486, "loss/crossentropy": 2.382603883743286, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2706970274448395, "step": 1520 }, { "epoch": 0.095125, "grad_norm": 3.0, "grad_norm_var": 0.04146219889322917, "learning_rate": 0.0001, "loss": 8.288, "loss/crossentropy": 2.2416555881500244, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26504258811473846, "step": 1522 }, { "epoch": 0.09525, "grad_norm": 3.046875, "grad_norm_var": 0.04072265625, "learning_rate": 0.0001, "loss": 8.3359, "loss/crossentropy": 2.483952045440674, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.288076788187027, "step": 1524 }, { "epoch": 0.095375, "grad_norm": 2.921875, "grad_norm_var": 0.03912353515625, "learning_rate": 0.0001, "loss": 8.3248, "loss/crossentropy": 2.6946524381637573, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2882222831249237, "step": 1526 }, { "epoch": 0.0955, "grad_norm": 2.921875, "grad_norm_var": 0.01900634765625, "learning_rate": 0.0001, "loss": 8.2016, "loss/crossentropy": 1.9769355058670044, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2325659841299057, "step": 1528 }, { "epoch": 0.095625, "grad_norm": 2.921875, "grad_norm_var": 0.01513671875, "learning_rate": 0.0001, "loss": 8.2965, "loss/crossentropy": 2.3487859964370728, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26721881330013275, "step": 1530 }, { "epoch": 0.09575, "grad_norm": 3.59375, "grad_norm_var": 0.039534505208333334, "learning_rate": 0.0001, "loss": 8.1814, "loss/crossentropy": 2.2702786922454834, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2642149329185486, "step": 1532 }, { "epoch": 0.095875, "grad_norm": 2.640625, "grad_norm_var": 0.056962076822916666, "learning_rate": 0.0001, "loss": 8.2477, "loss/crossentropy": 2.1052145957946777, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2681325525045395, "step": 1534 }, { "epoch": 0.096, "grad_norm": 3.9375, "grad_norm_var": 0.10074869791666667, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.356938362121582, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2789629250764847, "step": 1536 }, { "epoch": 0.096125, "grad_norm": 2.671875, "grad_norm_var": 0.11516825358072917, "learning_rate": 0.0001, "loss": 8.2291, "loss/crossentropy": 2.1644341945648193, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2661993205547333, "step": 1538 }, { "epoch": 0.09625, "grad_norm": 3.34375, "grad_norm_var": 0.12259012858072917, "learning_rate": 0.0001, "loss": 8.1661, "loss/crossentropy": 2.512505531311035, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.283734068274498, "step": 1540 }, { "epoch": 0.096375, "grad_norm": 2.71875, "grad_norm_var": 0.12873942057291668, "learning_rate": 0.0001, "loss": 8.1602, "loss/crossentropy": 2.2558088302612305, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26760005950927734, "step": 1542 }, { "epoch": 0.0965, "grad_norm": 3.0, "grad_norm_var": 0.12698160807291667, "learning_rate": 0.0001, "loss": 8.1477, "loss/crossentropy": 2.2201952934265137, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26513203978538513, "step": 1544 }, { "epoch": 0.096625, "grad_norm": 2.71875, "grad_norm_var": 0.13869527180989583, "learning_rate": 0.0001, "loss": 8.1612, "loss/crossentropy": 2.229737162590027, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2448047399520874, "step": 1546 }, { "epoch": 0.09675, "grad_norm": 3.234375, "grad_norm_var": 0.12079671223958334, "learning_rate": 0.0001, "loss": 8.1329, "loss/crossentropy": 2.3728041648864746, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2712114453315735, "step": 1548 }, { "epoch": 0.096875, "grad_norm": 3.0, "grad_norm_var": 0.10236002604166666, "learning_rate": 0.0001, "loss": 8.2579, "loss/crossentropy": 2.3911492824554443, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.28612037003040314, "step": 1550 }, { "epoch": 0.097, "grad_norm": 3.21875, "grad_norm_var": 0.054011027018229164, "learning_rate": 0.0001, "loss": 8.3094, "loss/crossentropy": 2.3950345516204834, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28530459105968475, "step": 1552 }, { "epoch": 0.097125, "grad_norm": 2.96875, "grad_norm_var": 0.051656087239583336, "learning_rate": 0.0001, "loss": 8.228, "loss/crossentropy": 2.577568531036377, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26758988201618195, "step": 1554 }, { "epoch": 0.09725, "grad_norm": 2.875, "grad_norm_var": 0.0450592041015625, "learning_rate": 0.0001, "loss": 8.2982, "loss/crossentropy": 2.4208312034606934, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2721874862909317, "step": 1556 }, { "epoch": 0.097375, "grad_norm": 2.828125, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.311411142349243, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2621803656220436, "step": 1558 }, { "epoch": 0.0975, "grad_norm": 3.015625, "grad_norm_var": 0.036295572916666664, "learning_rate": 0.0001, "loss": 8.278, "loss/crossentropy": 2.180152475833893, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.255389466881752, "step": 1560 }, { "epoch": 0.097625, "grad_norm": 2.953125, "grad_norm_var": 0.027587890625, "learning_rate": 0.0001, "loss": 8.3438, "loss/crossentropy": 2.5294106006622314, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26509322226047516, "step": 1562 }, { "epoch": 0.09775, "grad_norm": 3.03125, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 8.2816, "loss/crossentropy": 2.1683244705200195, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2431202381849289, "step": 1564 }, { "epoch": 0.097875, "grad_norm": 3.265625, "grad_norm_var": 0.029686482747395833, "learning_rate": 0.0001, "loss": 8.2192, "loss/crossentropy": 2.2188292741775513, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.29626937210559845, "step": 1566 }, { "epoch": 0.098, "grad_norm": 2.8125, "grad_norm_var": 0.0247955322265625, "learning_rate": 0.0001, "loss": 8.2902, "loss/crossentropy": 2.364318370819092, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26975926756858826, "step": 1568 }, { "epoch": 0.098125, "grad_norm": 3.0625, "grad_norm_var": 0.02056884765625, "learning_rate": 0.0001, "loss": 8.3332, "loss/crossentropy": 2.54610013961792, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28531205654144287, "step": 1570 }, { "epoch": 0.09825, "grad_norm": 3.09375, "grad_norm_var": 0.018659464518229165, "learning_rate": 0.0001, "loss": 8.2105, "loss/crossentropy": 2.206403374671936, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24771679937839508, "step": 1572 }, { "epoch": 0.098375, "grad_norm": 3.078125, "grad_norm_var": 0.015913899739583334, "learning_rate": 0.0001, "loss": 8.3768, "loss/crossentropy": 2.3607594966888428, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.28001710772514343, "step": 1574 }, { "epoch": 0.0985, "grad_norm": 3.15625, "grad_norm_var": 0.021581013997395832, "learning_rate": 0.0001, "loss": 8.2712, "loss/crossentropy": 2.2735308408737183, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26137876510620117, "step": 1576 }, { "epoch": 0.098625, "grad_norm": 2.734375, "grad_norm_var": 0.027912394205729166, "learning_rate": 0.0001, "loss": 8.2123, "loss/crossentropy": 2.3609447479248047, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26791180670261383, "step": 1578 }, { "epoch": 0.09875, "grad_norm": 3.234375, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 8.3814, "loss/crossentropy": 2.4629390239715576, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.29713912308216095, "step": 1580 }, { "epoch": 0.098875, "grad_norm": 3.171875, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 8.3342, "loss/crossentropy": 2.4477301836013794, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2905379682779312, "step": 1582 }, { "epoch": 0.099, "grad_norm": 2.84375, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 8.1553, "loss/crossentropy": 2.310616612434387, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28200674057006836, "step": 1584 }, { "epoch": 0.099125, "grad_norm": 2.984375, "grad_norm_var": 0.02744140625, "learning_rate": 0.0001, "loss": 8.202, "loss/crossentropy": 2.056865870952606, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2651172876358032, "step": 1586 }, { "epoch": 0.09925, "grad_norm": 3.546875, "grad_norm_var": 0.04753316243489583, "learning_rate": 0.0001, "loss": 8.3285, "loss/crossentropy": 2.222190737724304, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2600644379854202, "step": 1588 }, { "epoch": 0.099375, "grad_norm": 3.03125, "grad_norm_var": 0.05275777180989583, "learning_rate": 0.0001, "loss": 8.3277, "loss/crossentropy": 2.499345541000366, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27901938557624817, "step": 1590 }, { "epoch": 0.0995, "grad_norm": 3.09375, "grad_norm_var": 0.04485270182291667, "learning_rate": 0.0001, "loss": 8.306, "loss/crossentropy": 2.4675090312957764, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27757471799850464, "step": 1592 }, { "epoch": 0.099625, "grad_norm": 3.078125, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.0948686599731445, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24000447988510132, "step": 1594 }, { "epoch": 0.09975, "grad_norm": 3.078125, "grad_norm_var": 0.0365386962890625, "learning_rate": 0.0001, "loss": 8.2796, "loss/crossentropy": 2.2303179502487183, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25618939101696014, "step": 1596 }, { "epoch": 0.099875, "grad_norm": 2.78125, "grad_norm_var": 0.03728841145833333, "learning_rate": 0.0001, "loss": 8.2309, "loss/crossentropy": 2.172394037246704, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26014433801174164, "step": 1598 }, { "epoch": 0.1, "grad_norm": 3.0, "grad_norm_var": 0.0431640625, "learning_rate": 0.0001, "loss": 8.0142, "loss/crossentropy": 2.0257323384284973, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23609444499015808, "step": 1600 }, { "epoch": 0.100125, "grad_norm": 2.8125, "grad_norm_var": 0.044514973958333336, "learning_rate": 0.0001, "loss": 8.0713, "loss/crossentropy": 2.1410731077194214, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26623016595840454, "step": 1602 }, { "epoch": 0.10025, "grad_norm": 2.984375, "grad_norm_var": 0.021491495768229167, "learning_rate": 0.0001, "loss": 8.0494, "loss/crossentropy": 2.3695082664489746, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2746554762125015, "step": 1604 }, { "epoch": 0.100375, "grad_norm": 2.90625, "grad_norm_var": 0.021637980143229166, "learning_rate": 0.0001, "loss": 8.1369, "loss/crossentropy": 2.2459890842437744, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25808124244213104, "step": 1606 }, { "epoch": 0.1005, "grad_norm": 3.25, "grad_norm_var": 0.023591105143229166, "learning_rate": 0.0001, "loss": 8.3249, "loss/crossentropy": 2.4524621963500977, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2866296321153641, "step": 1608 }, { "epoch": 0.100625, "grad_norm": 3.0, "grad_norm_var": 0.023128255208333334, "learning_rate": 0.0001, "loss": 8.2293, "loss/crossentropy": 2.3052438497543335, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27561667561531067, "step": 1610 }, { "epoch": 0.10075, "grad_norm": 2.71875, "grad_norm_var": 0.021174112955729168, "learning_rate": 0.0001, "loss": 8.1306, "loss/crossentropy": 2.477377772331238, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26571913063526154, "step": 1612 }, { "epoch": 0.100875, "grad_norm": 2.90625, "grad_norm_var": 0.1718414306640625, "learning_rate": 0.0001, "loss": 8.4299, "loss/crossentropy": 2.2197115421295166, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2796938568353653, "step": 1614 }, { "epoch": 0.101, "grad_norm": 3.078125, "grad_norm_var": 0.17489827473958333, "learning_rate": 0.0001, "loss": 8.2917, "loss/crossentropy": 2.2831382751464844, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2742185890674591, "step": 1616 }, { "epoch": 0.101125, "grad_norm": 3.0625, "grad_norm_var": 0.17009989420572916, "learning_rate": 0.0001, "loss": 8.2352, "loss/crossentropy": 2.2610585689544678, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26356393098831177, "step": 1618 }, { "epoch": 0.10125, "grad_norm": 3.046875, "grad_norm_var": 0.16988525390625, "learning_rate": 0.0001, "loss": 8.2457, "loss/crossentropy": 2.2453945875167847, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2588353157043457, "step": 1620 }, { "epoch": 0.101375, "grad_norm": 3.203125, "grad_norm_var": 0.16035054524739584, "learning_rate": 0.0001, "loss": 8.2079, "loss/crossentropy": 2.2290210723876953, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27493180334568024, "step": 1622 }, { "epoch": 0.1015, "grad_norm": 3.0, "grad_norm_var": 0.15942281087239582, "learning_rate": 0.0001, "loss": 8.3946, "loss/crossentropy": 2.3389216661453247, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2627445012331009, "step": 1624 }, { "epoch": 0.101625, "grad_norm": 3.625, "grad_norm_var": 0.32203776041666665, "learning_rate": 0.0001, "loss": 8.2128, "loss/crossentropy": 2.193161904811859, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2609563320875168, "step": 1626 }, { "epoch": 0.10175, "grad_norm": 2.9375, "grad_norm_var": 0.29754231770833334, "learning_rate": 0.0001, "loss": 8.2789, "loss/crossentropy": 2.4037156105041504, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2840966284275055, "step": 1628 }, { "epoch": 0.101875, "grad_norm": 14.4375, "grad_norm_var": 8.05601298014323, "learning_rate": 0.0001, "loss": 8.7256, "loss/crossentropy": 2.2983932495117188, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2895353436470032, "step": 1630 }, { "epoch": 0.102, "grad_norm": 3.359375, "grad_norm_var": 8.257710774739584, "learning_rate": 0.0001, "loss": 8.5242, "loss/crossentropy": 2.5548095703125, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29210618138313293, "step": 1632 }, { "epoch": 0.102125, "grad_norm": 3.015625, "grad_norm_var": 8.2908203125, "learning_rate": 0.0001, "loss": 8.2313, "loss/crossentropy": 2.1125290393829346, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26304440200328827, "step": 1634 }, { "epoch": 0.10225, "grad_norm": 3.0, "grad_norm_var": 8.321890258789063, "learning_rate": 0.0001, "loss": 8.3129, "loss/crossentropy": 2.4635796546936035, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.289856493473053, "step": 1636 }, { "epoch": 0.102375, "grad_norm": 3.359375, "grad_norm_var": 8.303043619791667, "learning_rate": 0.0001, "loss": 8.3342, "loss/crossentropy": 2.4671066999435425, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.30989140272140503, "step": 1638 }, { "epoch": 0.1025, "grad_norm": 2.859375, "grad_norm_var": 8.330631510416667, "learning_rate": 0.0001, "loss": 8.3266, "loss/crossentropy": 2.2032480239868164, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28856223821640015, "step": 1640 }, { "epoch": 0.102625, "grad_norm": 2.875, "grad_norm_var": 8.449762980143229, "learning_rate": 0.0001, "loss": 8.0619, "loss/crossentropy": 2.384071946144104, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27716881036758423, "step": 1642 }, { "epoch": 0.10275, "grad_norm": 4.25, "grad_norm_var": 8.37940165201823, "learning_rate": 0.0001, "loss": 8.2985, "loss/crossentropy": 2.394433617591858, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27403971552848816, "step": 1644 }, { "epoch": 0.102875, "grad_norm": 3.0, "grad_norm_var": 0.6259073893229167, "learning_rate": 0.0001, "loss": 8.1275, "loss/crossentropy": 2.5770705938339233, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2827526032924652, "step": 1646 }, { "epoch": 0.103, "grad_norm": 2.9375, "grad_norm_var": 0.11741434733072917, "learning_rate": 0.0001, "loss": 8.1994, "loss/crossentropy": 2.203721523284912, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2632629871368408, "step": 1648 }, { "epoch": 0.103125, "grad_norm": 2.78125, "grad_norm_var": 0.11988525390625, "learning_rate": 0.0001, "loss": 8.27, "loss/crossentropy": 2.190592408180237, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2594703510403633, "step": 1650 }, { "epoch": 0.10325, "grad_norm": 3.109375, "grad_norm_var": 0.122119140625, "learning_rate": 0.0001, "loss": 8.3436, "loss/crossentropy": 2.3094884157180786, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2539513558149338, "step": 1652 }, { "epoch": 0.103375, "grad_norm": 2.8125, "grad_norm_var": 0.12323811848958334, "learning_rate": 0.0001, "loss": 8.2993, "loss/crossentropy": 2.2542308568954468, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27265970408916473, "step": 1654 }, { "epoch": 0.1035, "grad_norm": 3.65625, "grad_norm_var": 0.14322916666666666, "learning_rate": 0.0001, "loss": 8.2358, "loss/crossentropy": 2.150593101978302, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26226024329662323, "step": 1656 }, { "epoch": 0.103625, "grad_norm": 2.796875, "grad_norm_var": 0.15575764973958334, "learning_rate": 0.0001, "loss": 8.2424, "loss/crossentropy": 2.3359317779541016, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2621028572320938, "step": 1658 }, { "epoch": 0.10375, "grad_norm": 3.125, "grad_norm_var": 0.07243550618489583, "learning_rate": 0.0001, "loss": 8.209, "loss/crossentropy": 2.3354294300079346, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28642134368419647, "step": 1660 }, { "epoch": 0.103875, "grad_norm": 2.890625, "grad_norm_var": 0.07413736979166667, "learning_rate": 0.0001, "loss": 8.0643, "loss/crossentropy": 2.2628813982009888, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2720167338848114, "step": 1662 }, { "epoch": 0.104, "grad_norm": 2.640625, "grad_norm_var": 0.08414713541666667, "learning_rate": 0.0001, "loss": 8.2252, "loss/crossentropy": 2.405531644821167, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2855593413114548, "step": 1664 }, { "epoch": 0.104125, "grad_norm": 3.453125, "grad_norm_var": 0.09845377604166666, "learning_rate": 0.0001, "loss": 8.2726, "loss/crossentropy": 2.348948836326599, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2590363919734955, "step": 1666 }, { "epoch": 0.10425, "grad_norm": 2.6875, "grad_norm_var": 0.108740234375, "learning_rate": 0.0001, "loss": 8.0444, "loss/crossentropy": 2.275284171104431, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2520073354244232, "step": 1668 }, { "epoch": 0.104375, "grad_norm": 3.171875, "grad_norm_var": 0.10839436848958334, "learning_rate": 0.0001, "loss": 7.9805, "loss/crossentropy": 2.160663425922394, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25372669100761414, "step": 1670 }, { "epoch": 0.1045, "grad_norm": 3.1875, "grad_norm_var": 0.0904205322265625, "learning_rate": 0.0001, "loss": 8.2607, "loss/crossentropy": 2.2359933853149414, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2621243894100189, "step": 1672 }, { "epoch": 0.104625, "grad_norm": 2.78125, "grad_norm_var": 0.06806233723958334, "learning_rate": 0.0001, "loss": 7.9523, "loss/crossentropy": 1.9437886476516724, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2611730396747589, "step": 1674 }, { "epoch": 0.10475, "grad_norm": 2.859375, "grad_norm_var": 0.06712137858072917, "learning_rate": 0.0001, "loss": 8.0401, "loss/crossentropy": 2.288792371749878, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2550469785928726, "step": 1676 }, { "epoch": 0.104875, "grad_norm": 2.96875, "grad_norm_var": 0.07136128743489584, "learning_rate": 0.0001, "loss": 8.2327, "loss/crossentropy": 2.1720248460769653, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24942373484373093, "step": 1678 }, { "epoch": 0.105, "grad_norm": 2.65625, "grad_norm_var": 0.06942952473958333, "learning_rate": 0.0001, "loss": 8.2368, "loss/crossentropy": 2.2941821813583374, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2739366888999939, "step": 1680 }, { "epoch": 0.105125, "grad_norm": 3.0625, "grad_norm_var": 0.0464508056640625, "learning_rate": 0.0001, "loss": 8.234, "loss/crossentropy": 2.255189538002014, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2702263593673706, "step": 1682 }, { "epoch": 0.10525, "grad_norm": 3.078125, "grad_norm_var": 0.0395660400390625, "learning_rate": 0.0001, "loss": 8.1081, "loss/crossentropy": 2.419864535331726, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26446475088596344, "step": 1684 }, { "epoch": 0.105375, "grad_norm": 2.734375, "grad_norm_var": 0.041559855143229164, "learning_rate": 0.0001, "loss": 8.3004, "loss/crossentropy": 2.329113721847534, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2699923515319824, "step": 1686 }, { "epoch": 0.1055, "grad_norm": 2.890625, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 8.0978, "loss/crossentropy": 2.290674090385437, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2571800425648689, "step": 1688 }, { "epoch": 0.105625, "grad_norm": 2.703125, "grad_norm_var": 0.031787109375, "learning_rate": 0.0001, "loss": 7.992, "loss/crossentropy": 2.194283127784729, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2518819496035576, "step": 1690 }, { "epoch": 0.10575, "grad_norm": 3.625, "grad_norm_var": 0.06621805826822917, "learning_rate": 0.0001, "loss": 8.0845, "loss/crossentropy": 2.3665404319763184, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2546495646238327, "step": 1692 }, { "epoch": 0.105875, "grad_norm": 2.796875, "grad_norm_var": 0.06741536458333333, "learning_rate": 0.0001, "loss": 8.2148, "loss/crossentropy": 2.6665724515914917, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26855309307575226, "step": 1694 }, { "epoch": 0.106, "grad_norm": 2.59375, "grad_norm_var": 0.06809488932291667, "learning_rate": 0.0001, "loss": 8.0213, "loss/crossentropy": 2.0011618733406067, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2479224056005478, "step": 1696 }, { "epoch": 0.106125, "grad_norm": 3.0, "grad_norm_var": 0.06813151041666667, "learning_rate": 0.0001, "loss": 8.1117, "loss/crossentropy": 2.233310341835022, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26155102252960205, "step": 1698 }, { "epoch": 0.10625, "grad_norm": 3.078125, "grad_norm_var": 0.067138671875, "learning_rate": 0.0001, "loss": 8.1435, "loss/crossentropy": 2.023264706134796, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.23082198202610016, "step": 1700 }, { "epoch": 0.106375, "grad_norm": 2.984375, "grad_norm_var": 0.06722005208333333, "learning_rate": 0.0001, "loss": 8.0233, "loss/crossentropy": 2.2503018379211426, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24713517725467682, "step": 1702 }, { "epoch": 0.1065, "grad_norm": 2.796875, "grad_norm_var": 0.07294820149739584, "learning_rate": 0.0001, "loss": 7.9907, "loss/crossentropy": 2.557657241821289, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26641707122325897, "step": 1704 }, { "epoch": 0.106625, "grad_norm": 2.78125, "grad_norm_var": 0.06492411295572917, "learning_rate": 0.0001, "loss": 7.9908, "loss/crossentropy": 2.2161107063293457, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2457757443189621, "step": 1706 }, { "epoch": 0.10675, "grad_norm": 3.15625, "grad_norm_var": 0.041047159830729166, "learning_rate": 0.0001, "loss": 8.3599, "loss/crossentropy": 2.307586431503296, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25822295993566513, "step": 1708 }, { "epoch": 0.106875, "grad_norm": 2.875, "grad_norm_var": 0.03759765625, "learning_rate": 0.0001, "loss": 8.1595, "loss/crossentropy": 2.215519666671753, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2511584535241127, "step": 1710 }, { "epoch": 0.107, "grad_norm": 3.125, "grad_norm_var": 0.03365478515625, "learning_rate": 0.0001, "loss": 8.3037, "loss/crossentropy": 2.292387008666992, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2549327313899994, "step": 1712 }, { "epoch": 0.107125, "grad_norm": 2.9375, "grad_norm_var": 0.03853251139322917, "learning_rate": 0.0001, "loss": 8.1552, "loss/crossentropy": 2.573517322540283, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27707283198833466, "step": 1714 }, { "epoch": 0.10725, "grad_norm": 3.0625, "grad_norm_var": 0.03942057291666667, "learning_rate": 0.0001, "loss": 8.1879, "loss/crossentropy": 2.4223859310150146, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26580674946308136, "step": 1716 }, { "epoch": 0.107375, "grad_norm": 2.828125, "grad_norm_var": 0.03720296223958333, "learning_rate": 0.0001, "loss": 8.2149, "loss/crossentropy": 2.5507869720458984, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2808096259832382, "step": 1718 }, { "epoch": 0.1075, "grad_norm": 5.46875, "grad_norm_var": 0.42760009765625, "learning_rate": 0.0001, "loss": 8.2746, "loss/crossentropy": 2.4066158533096313, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27704988420009613, "step": 1720 }, { "epoch": 0.107625, "grad_norm": 3.1875, "grad_norm_var": 0.42789306640625, "learning_rate": 0.0001, "loss": 8.1194, "loss/crossentropy": 2.3192564249038696, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2466306835412979, "step": 1722 }, { "epoch": 0.10775, "grad_norm": 2.828125, "grad_norm_var": 0.4310943603515625, "learning_rate": 0.0001, "loss": 8.1758, "loss/crossentropy": 2.2524945735931396, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26875488460063934, "step": 1724 }, { "epoch": 0.107875, "grad_norm": 2.921875, "grad_norm_var": 0.4318359375, "learning_rate": 0.0001, "loss": 8.27, "loss/crossentropy": 2.2209118604660034, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25894972681999207, "step": 1726 }, { "epoch": 0.108, "grad_norm": 3.0625, "grad_norm_var": 0.43041890462239585, "learning_rate": 0.0001, "loss": 8.2942, "loss/crossentropy": 2.156678080558777, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24751365184783936, "step": 1728 }, { "epoch": 0.108125, "grad_norm": 2.96875, "grad_norm_var": 0.42939453125, "learning_rate": 0.0001, "loss": 7.9607, "loss/crossentropy": 2.1267510652542114, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24761785566806793, "step": 1730 }, { "epoch": 0.10825, "grad_norm": 2.6875, "grad_norm_var": 0.4383941650390625, "learning_rate": 0.0001, "loss": 7.9771, "loss/crossentropy": 2.300544857978821, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23868989944458008, "step": 1732 }, { "epoch": 0.108375, "grad_norm": 2.890625, "grad_norm_var": 0.45806884765625, "learning_rate": 0.0001, "loss": 8.1917, "loss/crossentropy": 2.360519051551819, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26660336554050446, "step": 1734 }, { "epoch": 0.1085, "grad_norm": 2.859375, "grad_norm_var": 0.06379292805989584, "learning_rate": 0.0001, "loss": 8.1922, "loss/crossentropy": 2.1705552339553833, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2528446614742279, "step": 1736 }, { "epoch": 0.108625, "grad_norm": 3.0625, "grad_norm_var": 0.034357706705729164, "learning_rate": 0.0001, "loss": 8.1198, "loss/crossentropy": 2.299151659011841, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25353947281837463, "step": 1738 }, { "epoch": 0.10875, "grad_norm": 2.53125, "grad_norm_var": 0.045210774739583334, "learning_rate": 0.0001, "loss": 8.2351, "loss/crossentropy": 2.172037899494171, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2686986029148102, "step": 1740 }, { "epoch": 0.108875, "grad_norm": 3.3125, "grad_norm_var": 0.0550201416015625, "learning_rate": 0.0001, "loss": 8.0653, "loss/crossentropy": 2.3453006744384766, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2615263909101486, "step": 1742 }, { "epoch": 0.109, "grad_norm": 2.59375, "grad_norm_var": 0.06112874348958333, "learning_rate": 0.0001, "loss": 8.0488, "loss/crossentropy": 2.4043914079666138, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2809004932641983, "step": 1744 }, { "epoch": 0.109125, "grad_norm": 3.296875, "grad_norm_var": 0.06349995930989584, "learning_rate": 0.0001, "loss": 8.3458, "loss/crossentropy": 2.3624587059020996, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29849672317504883, "step": 1746 }, { "epoch": 0.10925, "grad_norm": 3.25, "grad_norm_var": 0.06689453125, "learning_rate": 0.0001, "loss": 8.2422, "loss/crossentropy": 2.299923300743103, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2804667204618454, "step": 1748 }, { "epoch": 0.109375, "grad_norm": 2.625, "grad_norm_var": 0.06067301432291667, "learning_rate": 0.0001, "loss": 8.0319, "loss/crossentropy": 2.017127275466919, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.236178919672966, "step": 1750 }, { "epoch": 0.1095, "grad_norm": 2.875, "grad_norm_var": 0.05732014973958333, "learning_rate": 0.0001, "loss": 8.2062, "loss/crossentropy": 2.665374517440796, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28587816655635834, "step": 1752 }, { "epoch": 0.109625, "grad_norm": 2.609375, "grad_norm_var": 0.07112528483072916, "learning_rate": 0.0001, "loss": 8.129, "loss/crossentropy": 2.2756296396255493, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2523636817932129, "step": 1754 }, { "epoch": 0.10975, "grad_norm": 2.765625, "grad_norm_var": 0.058259073893229166, "learning_rate": 0.0001, "loss": 7.9899, "loss/crossentropy": 2.066028356552124, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23234106600284576, "step": 1756 }, { "epoch": 0.109875, "grad_norm": 2.796875, "grad_norm_var": 0.05266520182291667, "learning_rate": 0.0001, "loss": 8.1501, "loss/crossentropy": 2.307652711868286, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26358823478221893, "step": 1758 }, { "epoch": 0.11, "grad_norm": 2.84375, "grad_norm_var": 0.0484375, "learning_rate": 0.0001, "loss": 8.0521, "loss/crossentropy": 2.00510311126709, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2376161813735962, "step": 1760 }, { "epoch": 0.110125, "grad_norm": 2.765625, "grad_norm_var": 0.04407450358072917, "learning_rate": 0.0001, "loss": 8.1974, "loss/crossentropy": 2.4039831161499023, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27686847746372223, "step": 1762 }, { "epoch": 0.11025, "grad_norm": 2.8125, "grad_norm_var": 0.03303934733072917, "learning_rate": 0.0001, "loss": 8.0579, "loss/crossentropy": 2.3060104846954346, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2945534288883209, "step": 1764 }, { "epoch": 0.110375, "grad_norm": 3.40625, "grad_norm_var": 0.050047810872395834, "learning_rate": 0.0001, "loss": 8.0388, "loss/crossentropy": 2.4445682764053345, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27699966728687286, "step": 1766 }, { "epoch": 0.1105, "grad_norm": 2.75, "grad_norm_var": 0.05078837076822917, "learning_rate": 0.0001, "loss": 8.2157, "loss/crossentropy": 2.4216455221176147, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28081244230270386, "step": 1768 }, { "epoch": 0.110625, "grad_norm": 2.859375, "grad_norm_var": 0.035521443684895834, "learning_rate": 0.0001, "loss": 8.0611, "loss/crossentropy": 1.7756622433662415, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2289479374885559, "step": 1770 }, { "epoch": 0.11075, "grad_norm": 2.9375, "grad_norm_var": 0.03371480305989583, "learning_rate": 0.0001, "loss": 7.9772, "loss/crossentropy": 2.1726499795913696, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2653361111879349, "step": 1772 }, { "epoch": 0.110875, "grad_norm": 2.984375, "grad_norm_var": 0.034016927083333336, "learning_rate": 0.0001, "loss": 8.0615, "loss/crossentropy": 2.3147062063217163, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27281494438648224, "step": 1774 }, { "epoch": 0.111, "grad_norm": 3.09375, "grad_norm_var": 0.0338775634765625, "learning_rate": 0.0001, "loss": 8.1474, "loss/crossentropy": 2.1700609922409058, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25692617893218994, "step": 1776 }, { "epoch": 0.111125, "grad_norm": 2.8125, "grad_norm_var": 0.0308990478515625, "learning_rate": 0.0001, "loss": 8.0753, "loss/crossentropy": 2.114788770675659, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2623438090085983, "step": 1778 }, { "epoch": 0.11125, "grad_norm": 3.0625, "grad_norm_var": 0.03291727701822917, "learning_rate": 0.0001, "loss": 8.232, "loss/crossentropy": 2.3088366985321045, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25789759308099747, "step": 1780 }, { "epoch": 0.111375, "grad_norm": 2.828125, "grad_norm_var": 0.0161773681640625, "learning_rate": 0.0001, "loss": 8.1283, "loss/crossentropy": 2.021032750606537, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26125916838645935, "step": 1782 }, { "epoch": 0.1115, "grad_norm": 2.8125, "grad_norm_var": 0.016520182291666668, "learning_rate": 0.0001, "loss": 8.3159, "loss/crossentropy": 2.546655535697937, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26530279219150543, "step": 1784 }, { "epoch": 0.111625, "grad_norm": 2.703125, "grad_norm_var": 0.018094889322916665, "learning_rate": 0.0001, "loss": 8.0182, "loss/crossentropy": 2.1505188941955566, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23876308649778366, "step": 1786 }, { "epoch": 0.11175, "grad_norm": 2.875, "grad_norm_var": 0.019880167643229165, "learning_rate": 0.0001, "loss": 8.0242, "loss/crossentropy": 2.1757689714431763, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2557087540626526, "step": 1788 }, { "epoch": 0.111875, "grad_norm": 2.953125, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 8.0262, "loss/crossentropy": 2.28451144695282, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26241403818130493, "step": 1790 }, { "epoch": 0.112, "grad_norm": 2.765625, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 8.0235, "loss/crossentropy": 2.11034619808197, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2549893856048584, "step": 1792 }, { "epoch": 0.112125, "grad_norm": 2.828125, "grad_norm_var": 0.014969889322916667, "learning_rate": 0.0001, "loss": 8.076, "loss/crossentropy": 2.0472227931022644, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2562412843108177, "step": 1794 }, { "epoch": 0.11225, "grad_norm": 2.796875, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 7.928, "loss/crossentropy": 2.315675735473633, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2601415067911148, "step": 1796 }, { "epoch": 0.112375, "grad_norm": 2.953125, "grad_norm_var": 0.022443644205729165, "learning_rate": 0.0001, "loss": 7.8579, "loss/crossentropy": 2.0402532815933228, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2341025322675705, "step": 1798 }, { "epoch": 0.1125, "grad_norm": 2.78125, "grad_norm_var": 0.02135009765625, "learning_rate": 0.0001, "loss": 8.2218, "loss/crossentropy": 2.562678098678589, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2757340967655182, "step": 1800 }, { "epoch": 0.112625, "grad_norm": 3.140625, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 8.1885, "loss/crossentropy": 2.0969003438949585, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2326880842447281, "step": 1802 }, { "epoch": 0.11275, "grad_norm": 2.734375, "grad_norm_var": 0.030720011393229166, "learning_rate": 0.0001, "loss": 8.1163, "loss/crossentropy": 2.332270383834839, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25439298152923584, "step": 1804 }, { "epoch": 0.112875, "grad_norm": 3.125, "grad_norm_var": 0.03774312337239583, "learning_rate": 0.0001, "loss": 8.106, "loss/crossentropy": 2.162129521369934, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26606719195842743, "step": 1806 }, { "epoch": 0.113, "grad_norm": 3.109375, "grad_norm_var": 0.0377593994140625, "learning_rate": 0.0001, "loss": 8.1417, "loss/crossentropy": 2.2731558084487915, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2553500384092331, "step": 1808 }, { "epoch": 0.113125, "grad_norm": 2.6875, "grad_norm_var": 0.0411529541015625, "learning_rate": 0.0001, "loss": 8.0807, "loss/crossentropy": 2.10029274225235, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23542343080043793, "step": 1810 }, { "epoch": 0.11325, "grad_norm": 2.875, "grad_norm_var": 0.04257405598958333, "learning_rate": 0.0001, "loss": 8.1634, "loss/crossentropy": 2.426058769226074, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2891880124807358, "step": 1812 }, { "epoch": 0.113375, "grad_norm": 3.0625, "grad_norm_var": 0.033154296875, "learning_rate": 0.0001, "loss": 8.0769, "loss/crossentropy": 2.4051181077957153, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2494017630815506, "step": 1814 }, { "epoch": 0.1135, "grad_norm": 2.671875, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 8.0919, "loss/crossentropy": 2.0648642778396606, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23387089371681213, "step": 1816 }, { "epoch": 0.113625, "grad_norm": 3.25, "grad_norm_var": 0.0592681884765625, "learning_rate": 0.0001, "loss": 8.078, "loss/crossentropy": 1.989893615245819, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23796076327562332, "step": 1818 }, { "epoch": 0.11375, "grad_norm": 2.828125, "grad_norm_var": 0.0541656494140625, "learning_rate": 0.0001, "loss": 7.9998, "loss/crossentropy": 1.9235325455665588, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22104668617248535, "step": 1820 }, { "epoch": 0.113875, "grad_norm": 2.828125, "grad_norm_var": 0.05319010416666667, "learning_rate": 0.0001, "loss": 8.0283, "loss/crossentropy": 2.39365816116333, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25356078147888184, "step": 1822 }, { "epoch": 0.114, "grad_norm": 2.765625, "grad_norm_var": 0.05628153483072917, "learning_rate": 0.0001, "loss": 8.0969, "loss/crossentropy": 2.2098069190979004, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2322394847869873, "step": 1824 }, { "epoch": 0.114125, "grad_norm": 2.59375, "grad_norm_var": 0.05894266764322917, "learning_rate": 0.0001, "loss": 8.0474, "loss/crossentropy": 2.3926165103912354, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26291845738887787, "step": 1826 }, { "epoch": 0.11425, "grad_norm": 2.90625, "grad_norm_var": 0.057494099934895834, "learning_rate": 0.0001, "loss": 8.0912, "loss/crossentropy": 2.271665573120117, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28214675188064575, "step": 1828 }, { "epoch": 0.114375, "grad_norm": 2.96875, "grad_norm_var": 0.055826822916666664, "learning_rate": 0.0001, "loss": 8.2937, "loss/crossentropy": 2.3243162631988525, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25888554751873016, "step": 1830 }, { "epoch": 0.1145, "grad_norm": 2.828125, "grad_norm_var": 0.05383707682291667, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.4674028158187866, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2678636610507965, "step": 1832 }, { "epoch": 0.114625, "grad_norm": 2.90625, "grad_norm_var": 0.023795572916666667, "learning_rate": 0.0001, "loss": 8.2065, "loss/crossentropy": 2.2178725004196167, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2575995400547981, "step": 1834 }, { "epoch": 0.11475, "grad_norm": 2.90625, "grad_norm_var": 0.023444620768229167, "learning_rate": 0.0001, "loss": 8.086, "loss/crossentropy": 2.173088550567627, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25725461542606354, "step": 1836 }, { "epoch": 0.114875, "grad_norm": 2.953125, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 8.013, "loss/crossentropy": 2.167203664779663, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2551988810300827, "step": 1838 }, { "epoch": 0.115, "grad_norm": 2.828125, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 8.0137, "loss/crossentropy": 2.324142336845398, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25543810427188873, "step": 1840 }, { "epoch": 0.115125, "grad_norm": 2.6875, "grad_norm_var": 0.008675130208333333, "learning_rate": 0.0001, "loss": 7.8792, "loss/crossentropy": 2.3638752698898315, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26999443769454956, "step": 1842 }, { "epoch": 0.11525, "grad_norm": 2.859375, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 8.0577, "loss/crossentropy": 2.236335277557373, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24563505500555038, "step": 1844 }, { "epoch": 0.115375, "grad_norm": 2.78125, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 8.0579, "loss/crossentropy": 2.3817098140716553, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25307345390319824, "step": 1846 }, { "epoch": 0.1155, "grad_norm": 2.71875, "grad_norm_var": 0.016373697916666666, "learning_rate": 0.0001, "loss": 7.796, "loss/crossentropy": 2.2049105167388916, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2526453882455826, "step": 1848 }, { "epoch": 0.115625, "grad_norm": 2.984375, "grad_norm_var": 0.019429524739583332, "learning_rate": 0.0001, "loss": 8.2624, "loss/crossentropy": 2.544666051864624, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27838000655174255, "step": 1850 }, { "epoch": 0.11575, "grad_norm": 2.859375, "grad_norm_var": 0.0191070556640625, "learning_rate": 0.0001, "loss": 8.0722, "loss/crossentropy": 2.4957003593444824, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25582027435302734, "step": 1852 }, { "epoch": 0.115875, "grad_norm": 3.21875, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 7.9738, "loss/crossentropy": 2.2852269411087036, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27101629972457886, "step": 1854 }, { "epoch": 0.116, "grad_norm": 2.953125, "grad_norm_var": 0.030790201822916665, "learning_rate": 0.0001, "loss": 7.9897, "loss/crossentropy": 2.1064014434814453, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24057136476039886, "step": 1856 }, { "epoch": 0.116125, "grad_norm": 3.140625, "grad_norm_var": 0.03762105305989583, "learning_rate": 0.0001, "loss": 8.1089, "loss/crossentropy": 2.352795124053955, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23926259577274323, "step": 1858 }, { "epoch": 0.11625, "grad_norm": 2.484375, "grad_norm_var": 0.04638671875, "learning_rate": 0.0001, "loss": 7.8246, "loss/crossentropy": 2.085222840309143, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.21554403752088547, "step": 1860 }, { "epoch": 0.116375, "grad_norm": 2.65625, "grad_norm_var": 0.0461578369140625, "learning_rate": 0.0001, "loss": 7.9895, "loss/crossentropy": 1.9475982785224915, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2195342779159546, "step": 1862 }, { "epoch": 0.1165, "grad_norm": 2.703125, "grad_norm_var": 0.04849853515625, "learning_rate": 0.0001, "loss": 7.9607, "loss/crossentropy": 2.4439034461975098, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27569329738616943, "step": 1864 }, { "epoch": 0.116625, "grad_norm": 3.328125, "grad_norm_var": 0.06008199055989583, "learning_rate": 0.0001, "loss": 7.9813, "loss/crossentropy": 2.3087748289108276, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2708819806575775, "step": 1866 }, { "epoch": 0.11675, "grad_norm": 2.859375, "grad_norm_var": 0.06638895670572917, "learning_rate": 0.0001, "loss": 8.0842, "loss/crossentropy": 2.2168221473693848, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2662223279476166, "step": 1868 }, { "epoch": 0.116875, "grad_norm": 2.59375, "grad_norm_var": 0.06785380045572917, "learning_rate": 0.0001, "loss": 8.0107, "loss/crossentropy": 2.008695662021637, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24834764003753662, "step": 1870 }, { "epoch": 0.117, "grad_norm": 3.109375, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 8.1764, "loss/crossentropy": 2.2949352860450745, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23978671431541443, "step": 1872 }, { "epoch": 0.117125, "grad_norm": 2.8125, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 7.9191, "loss/crossentropy": 2.1248743534088135, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24373694509267807, "step": 1874 }, { "epoch": 0.11725, "grad_norm": 5.25, "grad_norm_var": 0.4083984375, "learning_rate": 0.0001, "loss": 8.1591, "loss/crossentropy": 2.4608160257339478, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2699667811393738, "step": 1876 }, { "epoch": 0.117375, "grad_norm": 3.765625, "grad_norm_var": 0.43925679524739586, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.192078948020935, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26026079058647156, "step": 1878 }, { "epoch": 0.1175, "grad_norm": 3.25, "grad_norm_var": 0.4112701416015625, "learning_rate": 0.0001, "loss": 7.974, "loss/crossentropy": 2.079145610332489, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2560829073190689, "step": 1880 }, { "epoch": 0.117625, "grad_norm": 3.078125, "grad_norm_var": 0.4034657796223958, "learning_rate": 0.0001, "loss": 8.0137, "loss/crossentropy": 2.2801836133003235, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2451099008321762, "step": 1882 }, { "epoch": 0.11775, "grad_norm": 2.734375, "grad_norm_var": 0.43884175618489585, "learning_rate": 0.0001, "loss": 8.0881, "loss/crossentropy": 2.373893141746521, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26466645300388336, "step": 1884 }, { "epoch": 0.117875, "grad_norm": 3.0, "grad_norm_var": 0.413330078125, "learning_rate": 0.0001, "loss": 7.9685, "loss/crossentropy": 2.411695957183838, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2512810304760933, "step": 1886 }, { "epoch": 0.118, "grad_norm": 2.640625, "grad_norm_var": 0.42668355305989586, "learning_rate": 0.0001, "loss": 8.0557, "loss/crossentropy": 2.049705147743225, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2417662888765335, "step": 1888 }, { "epoch": 0.118125, "grad_norm": 2.890625, "grad_norm_var": 0.44470113118489585, "learning_rate": 0.0001, "loss": 8.0032, "loss/crossentropy": 2.1323426961898804, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23288051038980484, "step": 1890 }, { "epoch": 0.11825, "grad_norm": 3.203125, "grad_norm_var": 0.1299957275390625, "learning_rate": 0.0001, "loss": 8.1034, "loss/crossentropy": 2.246406674385071, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26154619455337524, "step": 1892 }, { "epoch": 0.118375, "grad_norm": 3.1875, "grad_norm_var": 0.07519429524739583, "learning_rate": 0.0001, "loss": 8.2607, "loss/crossentropy": 2.3726965188980103, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2482641637325287, "step": 1894 }, { "epoch": 0.1185, "grad_norm": 2.921875, "grad_norm_var": 0.06081441243489583, "learning_rate": 0.0001, "loss": 8.0314, "loss/crossentropy": 2.2549182176589966, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2727925777435303, "step": 1896 }, { "epoch": 0.118625, "grad_norm": 2.609375, "grad_norm_var": 0.06313374837239584, "learning_rate": 0.0001, "loss": 8.2286, "loss/crossentropy": 2.471170663833618, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27910932898521423, "step": 1898 }, { "epoch": 0.11875, "grad_norm": 2.859375, "grad_norm_var": 0.05249735514322917, "learning_rate": 0.0001, "loss": 8.0647, "loss/crossentropy": 2.6244795322418213, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28073398768901825, "step": 1900 }, { "epoch": 0.118875, "grad_norm": 2.71875, "grad_norm_var": 0.049128214518229164, "learning_rate": 0.0001, "loss": 7.8933, "loss/crossentropy": 2.3852927684783936, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26651833206415176, "step": 1902 }, { "epoch": 0.119, "grad_norm": 2.703125, "grad_norm_var": 0.035521443684895834, "learning_rate": 0.0001, "loss": 8.0328, "loss/crossentropy": 2.5933183431625366, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.271125927567482, "step": 1904 }, { "epoch": 0.119125, "grad_norm": 3.4375, "grad_norm_var": 0.05452372233072917, "learning_rate": 0.0001, "loss": 8.0956, "loss/crossentropy": 2.2415446043014526, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.258744515478611, "step": 1906 }, { "epoch": 0.11925, "grad_norm": 2.390625, "grad_norm_var": 0.06295166015625, "learning_rate": 0.0001, "loss": 7.882, "loss/crossentropy": 2.1915100812911987, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26428738236427307, "step": 1908 }, { "epoch": 0.119375, "grad_norm": 2.9375, "grad_norm_var": 0.059300740559895836, "learning_rate": 0.0001, "loss": 7.8918, "loss/crossentropy": 2.024084210395813, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23683403432369232, "step": 1910 }, { "epoch": 0.1195, "grad_norm": 2.546875, "grad_norm_var": 0.08255208333333333, "learning_rate": 0.0001, "loss": 8.1278, "loss/crossentropy": 2.103874683380127, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2549164593219757, "step": 1912 }, { "epoch": 0.119625, "grad_norm": 2.859375, "grad_norm_var": 0.0891021728515625, "learning_rate": 0.0001, "loss": 8.0439, "loss/crossentropy": 2.1909857988357544, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25458941608667374, "step": 1914 }, { "epoch": 0.11975, "grad_norm": 2.734375, "grad_norm_var": 0.0890625, "learning_rate": 0.0001, "loss": 8.0189, "loss/crossentropy": 2.3163031339645386, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26227420568466187, "step": 1916 }, { "epoch": 0.119875, "grad_norm": 2.984375, "grad_norm_var": 0.0921539306640625, "learning_rate": 0.0001, "loss": 8.1534, "loss/crossentropy": 2.4677486419677734, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2759769409894943, "step": 1918 }, { "epoch": 0.12, "grad_norm": 2.859375, "grad_norm_var": 0.09339192708333334, "learning_rate": 0.0001, "loss": 8.004, "loss/crossentropy": 2.3451250791549683, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24791867285966873, "step": 1920 }, { "epoch": 0.120125, "grad_norm": 2.53125, "grad_norm_var": 0.06575113932291667, "learning_rate": 0.0001, "loss": 7.9536, "loss/crossentropy": 2.39498770236969, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26757124066352844, "step": 1922 }, { "epoch": 0.12025, "grad_norm": 2.734375, "grad_norm_var": 0.059912109375, "learning_rate": 0.0001, "loss": 7.9078, "loss/crossentropy": 2.137160062789917, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23972496390342712, "step": 1924 }, { "epoch": 0.120375, "grad_norm": 3.203125, "grad_norm_var": 0.06774800618489583, "learning_rate": 0.0001, "loss": 7.9543, "loss/crossentropy": 2.354183554649353, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2630910202860832, "step": 1926 }, { "epoch": 0.1205, "grad_norm": 2.796875, "grad_norm_var": 0.04582926432291667, "learning_rate": 0.0001, "loss": 7.9081, "loss/crossentropy": 2.0638818740844727, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2652597352862358, "step": 1928 }, { "epoch": 0.120625, "grad_norm": 3.203125, "grad_norm_var": 0.052534993489583334, "learning_rate": 0.0001, "loss": 8.0497, "loss/crossentropy": 2.588783383369446, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2830745279788971, "step": 1930 }, { "epoch": 0.12075, "grad_norm": 2.671875, "grad_norm_var": 0.05359700520833333, "learning_rate": 0.0001, "loss": 8.0327, "loss/crossentropy": 2.223568558692932, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25885971635580063, "step": 1932 }, { "epoch": 0.120875, "grad_norm": 2.78125, "grad_norm_var": 0.049641927083333336, "learning_rate": 0.0001, "loss": 7.9963, "loss/crossentropy": 2.4340078830718994, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2645603120326996, "step": 1934 }, { "epoch": 0.121, "grad_norm": 2.84375, "grad_norm_var": 0.048046875, "learning_rate": 0.0001, "loss": 7.7813, "loss/crossentropy": 1.9766615629196167, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22679174691438675, "step": 1936 }, { "epoch": 0.121125, "grad_norm": 2.796875, "grad_norm_var": 0.043782552083333336, "learning_rate": 0.0001, "loss": 8.0526, "loss/crossentropy": 2.007621169090271, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.232595793902874, "step": 1938 }, { "epoch": 0.12125, "grad_norm": 2.875, "grad_norm_var": 0.03530171712239583, "learning_rate": 0.0001, "loss": 8.0147, "loss/crossentropy": 2.245633602142334, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2521408647298813, "step": 1940 }, { "epoch": 0.121375, "grad_norm": 2.859375, "grad_norm_var": 0.05499674479166667, "learning_rate": 0.0001, "loss": 8.1478, "loss/crossentropy": 2.239235758781433, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29093019664287567, "step": 1942 }, { "epoch": 0.1215, "grad_norm": 3.125, "grad_norm_var": 0.05185139973958333, "learning_rate": 0.0001, "loss": 8.0329, "loss/crossentropy": 2.219251275062561, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24115260690450668, "step": 1944 }, { "epoch": 0.121625, "grad_norm": 3.171875, "grad_norm_var": 0.079296875, "learning_rate": 0.0001, "loss": 8.2787, "loss/crossentropy": 2.3880057334899902, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2628382295370102, "step": 1946 }, { "epoch": 0.12175, "grad_norm": 2.640625, "grad_norm_var": 0.07984619140625, "learning_rate": 0.0001, "loss": 8.0593, "loss/crossentropy": 1.9636898040771484, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22413796931505203, "step": 1948 }, { "epoch": 0.121875, "grad_norm": 3.09375, "grad_norm_var": 0.07724202473958333, "learning_rate": 0.0001, "loss": 8.2408, "loss/crossentropy": 2.4159456491470337, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2786535918712616, "step": 1950 }, { "epoch": 0.122, "grad_norm": 2.9375, "grad_norm_var": 0.06634114583333334, "learning_rate": 0.0001, "loss": 8.0623, "loss/crossentropy": 2.125056028366089, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2584911435842514, "step": 1952 }, { "epoch": 0.122125, "grad_norm": 2.8125, "grad_norm_var": 0.07579752604166666, "learning_rate": 0.0001, "loss": 8.2316, "loss/crossentropy": 2.15469229221344, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24431538581848145, "step": 1954 }, { "epoch": 0.12225, "grad_norm": 2.828125, "grad_norm_var": 0.0720855712890625, "learning_rate": 0.0001, "loss": 7.9661, "loss/crossentropy": 2.124357581138611, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25534383952617645, "step": 1956 }, { "epoch": 0.122375, "grad_norm": 2.6875, "grad_norm_var": 0.058756510416666664, "learning_rate": 0.0001, "loss": 8.016, "loss/crossentropy": 2.344644784927368, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26001378893852234, "step": 1958 }, { "epoch": 0.1225, "grad_norm": 2.78125, "grad_norm_var": 0.059130859375, "learning_rate": 0.0001, "loss": 8.0888, "loss/crossentropy": 2.242557406425476, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25998418033123016, "step": 1960 }, { "epoch": 0.122625, "grad_norm": 2.578125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 7.9998, "loss/crossentropy": 2.1519815921783447, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24088148772716522, "step": 1962 }, { "epoch": 0.12275, "grad_norm": 3.09375, "grad_norm_var": 0.0291015625, "learning_rate": 0.0001, "loss": 7.9661, "loss/crossentropy": 2.0413911938667297, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2583937346935272, "step": 1964 }, { "epoch": 0.122875, "grad_norm": 2.5625, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 8.0222, "loss/crossentropy": 2.404345154762268, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23985719680786133, "step": 1966 }, { "epoch": 0.123, "grad_norm": 2.71875, "grad_norm_var": 0.044709269205729166, "learning_rate": 0.0001, "loss": 8.0141, "loss/crossentropy": 2.6169755458831787, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2697141170501709, "step": 1968 }, { "epoch": 0.123125, "grad_norm": 2.90625, "grad_norm_var": 0.045832316080729164, "learning_rate": 0.0001, "loss": 8.0295, "loss/crossentropy": 2.2693088054656982, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2512510120868683, "step": 1970 }, { "epoch": 0.12325, "grad_norm": 2.578125, "grad_norm_var": 0.050699869791666664, "learning_rate": 0.0001, "loss": 7.9091, "loss/crossentropy": 2.4330859184265137, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2577967271208763, "step": 1972 }, { "epoch": 0.123375, "grad_norm": 3.046875, "grad_norm_var": 0.052490234375, "learning_rate": 0.0001, "loss": 8.1396, "loss/crossentropy": 2.560065507888794, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2873089164495468, "step": 1974 }, { "epoch": 0.1235, "grad_norm": 3.1875, "grad_norm_var": 0.060155232747395836, "learning_rate": 0.0001, "loss": 7.8746, "loss/crossentropy": 2.174700140953064, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2541915774345398, "step": 1976 }, { "epoch": 0.123625, "grad_norm": 2.765625, "grad_norm_var": 0.05668843587239583, "learning_rate": 0.0001, "loss": 7.947, "loss/crossentropy": 2.164485454559326, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25544650852680206, "step": 1978 }, { "epoch": 0.12375, "grad_norm": 3.21875, "grad_norm_var": 0.06304423014322917, "learning_rate": 0.0001, "loss": 8.1361, "loss/crossentropy": 2.110231041908264, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2481146827340126, "step": 1980 }, { "epoch": 0.123875, "grad_norm": 2.78125, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 8.0873, "loss/crossentropy": 2.4308364391326904, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2560970336198807, "step": 1982 }, { "epoch": 0.124, "grad_norm": 2.828125, "grad_norm_var": 0.0412750244140625, "learning_rate": 0.0001, "loss": 8.0323, "loss/crossentropy": 2.481392025947571, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2720007449388504, "step": 1984 }, { "epoch": 0.124125, "grad_norm": 2.9375, "grad_norm_var": 0.041975911458333334, "learning_rate": 0.0001, "loss": 7.857, "loss/crossentropy": 2.1386367082595825, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26166096329689026, "step": 1986 }, { "epoch": 0.12425, "grad_norm": 2.6875, "grad_norm_var": 0.04220377604166667, "learning_rate": 0.0001, "loss": 8.0093, "loss/crossentropy": 2.333972215652466, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25886698067188263, "step": 1988 }, { "epoch": 0.124375, "grad_norm": 3.265625, "grad_norm_var": 0.05032145182291667, "learning_rate": 0.0001, "loss": 8.0834, "loss/crossentropy": 2.41828191280365, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24262161552906036, "step": 1990 }, { "epoch": 0.1245, "grad_norm": 2.53125, "grad_norm_var": 0.046052042643229166, "learning_rate": 0.0001, "loss": 8.0389, "loss/crossentropy": 2.1740458011627197, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23383785039186478, "step": 1992 }, { "epoch": 0.124625, "grad_norm": 2.5625, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 8.0157, "loss/crossentropy": 2.2331719398498535, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2640424221754074, "step": 1994 }, { "epoch": 0.12475, "grad_norm": 3.015625, "grad_norm_var": 0.04023335774739583, "learning_rate": 0.0001, "loss": 7.9991, "loss/crossentropy": 2.1523889303207397, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2687607556581497, "step": 1996 }, { "epoch": 0.124875, "grad_norm": 2.859375, "grad_norm_var": 0.040526326497395834, "learning_rate": 0.0001, "loss": 7.9342, "loss/crossentropy": 2.171301484107971, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24432216584682465, "step": 1998 }, { "epoch": 0.125, "grad_norm": 2.78125, "grad_norm_var": 0.04273681640625, "learning_rate": 0.0001, "loss": 8.1111, "loss/crossentropy": 2.2224671840667725, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23374950140714645, "step": 2000 }, { "epoch": 0.125125, "grad_norm": 2.5, "grad_norm_var": 0.04103902180989583, "learning_rate": 0.0001, "loss": 8.0055, "loss/crossentropy": 2.1209537386894226, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2525208741426468, "step": 2002 }, { "epoch": 0.12525, "grad_norm": 2.5625, "grad_norm_var": 0.04299723307291667, "learning_rate": 0.0001, "loss": 7.8668, "loss/crossentropy": 2.1079421639442444, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25046999752521515, "step": 2004 }, { "epoch": 0.125375, "grad_norm": 3.078125, "grad_norm_var": 0.03804931640625, "learning_rate": 0.0001, "loss": 7.9843, "loss/crossentropy": 2.3907183408737183, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2595418617129326, "step": 2006 }, { "epoch": 0.1255, "grad_norm": 2.78125, "grad_norm_var": 0.0336578369140625, "learning_rate": 0.0001, "loss": 7.9723, "loss/crossentropy": 2.081270694732666, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25873951613903046, "step": 2008 }, { "epoch": 0.125625, "grad_norm": 3.046875, "grad_norm_var": 0.033675130208333334, "learning_rate": 0.0001, "loss": 8.0587, "loss/crossentropy": 2.204562723636627, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24240562319755554, "step": 2010 }, { "epoch": 0.12575, "grad_norm": 2.6875, "grad_norm_var": 0.030549112955729166, "learning_rate": 0.0001, "loss": 8.0825, "loss/crossentropy": 2.234739661216736, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23210185766220093, "step": 2012 }, { "epoch": 0.125875, "grad_norm": 2.984375, "grad_norm_var": 0.03386128743489583, "learning_rate": 0.0001, "loss": 7.9408, "loss/crossentropy": 2.155194342136383, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24713550508022308, "step": 2014 }, { "epoch": 0.126, "grad_norm": 2.921875, "grad_norm_var": 0.03388671875, "learning_rate": 0.0001, "loss": 8.0006, "loss/crossentropy": 2.1440590620040894, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24631793051958084, "step": 2016 }, { "epoch": 0.126125, "grad_norm": 3.375, "grad_norm_var": 0.06747945149739583, "learning_rate": 0.0001, "loss": 8.2657, "loss/crossentropy": 2.2782651782035828, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25610214471817017, "step": 2018 }, { "epoch": 0.12625, "grad_norm": 2.703125, "grad_norm_var": 0.0567535400390625, "learning_rate": 0.0001, "loss": 7.9078, "loss/crossentropy": 2.2536474466323853, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23063694685697556, "step": 2020 }, { "epoch": 0.126375, "grad_norm": 3.890625, "grad_norm_var": 0.1236724853515625, "learning_rate": 0.0001, "loss": 8.0493, "loss/crossentropy": 2.2990732192993164, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.28477251529693604, "step": 2022 }, { "epoch": 0.1265, "grad_norm": 3.15625, "grad_norm_var": 0.1252593994140625, "learning_rate": 0.0001, "loss": 8.0902, "loss/crossentropy": 2.3423362970352173, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27799367904663086, "step": 2024 }, { "epoch": 0.126625, "grad_norm": 3.359375, "grad_norm_var": 0.7259999593098958, "learning_rate": 0.0001, "loss": 8.3429, "loss/crossentropy": 2.1187247037887573, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25325731933116913, "step": 2026 }, { "epoch": 0.12675, "grad_norm": 3.328125, "grad_norm_var": 0.6796061197916666, "learning_rate": 0.0001, "loss": 8.2877, "loss/crossentropy": 2.6572694778442383, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28467129170894623, "step": 2028 }, { "epoch": 0.126875, "grad_norm": 2.953125, "grad_norm_var": 0.6576171875, "learning_rate": 0.0001, "loss": 8.1366, "loss/crossentropy": 2.149785280227661, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.33148565888404846, "step": 2030 }, { "epoch": 0.127, "grad_norm": 2.984375, "grad_norm_var": 0.6441243489583334, "learning_rate": 0.0001, "loss": 8.016, "loss/crossentropy": 1.9550745487213135, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.21983042359352112, "step": 2032 }, { "epoch": 0.127125, "grad_norm": 2.59375, "grad_norm_var": 0.70074462890625, "learning_rate": 0.0001, "loss": 7.9985, "loss/crossentropy": 2.4743508100509644, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2646654099225998, "step": 2034 }, { "epoch": 0.12725, "grad_norm": 2.953125, "grad_norm_var": 0.679931640625, "learning_rate": 0.0001, "loss": 7.7714, "loss/crossentropy": 2.2703075408935547, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24541212618350983, "step": 2036 }, { "epoch": 0.127375, "grad_norm": 2.875, "grad_norm_var": 0.66455078125, "learning_rate": 0.0001, "loss": 8.2056, "loss/crossentropy": 2.400794506072998, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2792155146598816, "step": 2038 }, { "epoch": 0.1275, "grad_norm": 2.703125, "grad_norm_var": 0.7059529622395834, "learning_rate": 0.0001, "loss": 7.9121, "loss/crossentropy": 2.221606969833374, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27579738199710846, "step": 2040 }, { "epoch": 0.127625, "grad_norm": 2.875, "grad_norm_var": 0.05742085774739583, "learning_rate": 0.0001, "loss": 8.0984, "loss/crossentropy": 2.1297446489334106, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2457616627216339, "step": 2042 }, { "epoch": 0.12775, "grad_norm": 6.28125, "grad_norm_var": 0.7566802978515625, "learning_rate": 0.0001, "loss": 8.1542, "loss/crossentropy": 2.068065047264099, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2585330307483673, "step": 2044 }, { "epoch": 0.127875, "grad_norm": 3.515625, "grad_norm_var": 0.7755767822265625, "learning_rate": 0.0001, "loss": 8.2794, "loss/crossentropy": 2.285371780395508, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26811927556991577, "step": 2046 }, { "epoch": 0.128, "grad_norm": 3.015625, "grad_norm_var": 0.7772420247395834, "learning_rate": 0.0001, "loss": 8.0508, "loss/crossentropy": 2.102661430835724, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24708286672830582, "step": 2048 }, { "epoch": 0.128125, "grad_norm": 2.78125, "grad_norm_var": 0.7524648030598958, "learning_rate": 0.0001, "loss": 7.87, "loss/crossentropy": 1.9193878173828125, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24057899415493011, "step": 2050 }, { "epoch": 0.12825, "grad_norm": 2.828125, "grad_norm_var": 0.7676096598307292, "learning_rate": 0.0001, "loss": 7.9655, "loss/crossentropy": 2.2599531412124634, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2473764270544052, "step": 2052 }, { "epoch": 0.128375, "grad_norm": 3.03125, "grad_norm_var": 0.7643717447916667, "learning_rate": 0.0001, "loss": 8.0443, "loss/crossentropy": 2.1840824484825134, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21873797476291656, "step": 2054 }, { "epoch": 0.1285, "grad_norm": 2.515625, "grad_norm_var": 0.7554026285807292, "learning_rate": 0.0001, "loss": 7.8934, "loss/crossentropy": 2.5172749757766724, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2471792846918106, "step": 2056 }, { "epoch": 0.128625, "grad_norm": 2.71875, "grad_norm_var": 0.7780436197916667, "learning_rate": 0.0001, "loss": 7.9608, "loss/crossentropy": 2.5356470346450806, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27692335844039917, "step": 2058 }, { "epoch": 0.12875, "grad_norm": 3.03125, "grad_norm_var": 0.08212483723958333, "learning_rate": 0.0001, "loss": 8.1766, "loss/crossentropy": 2.3050994873046875, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27711644768714905, "step": 2060 }, { "epoch": 0.128875, "grad_norm": 2.78125, "grad_norm_var": 0.031245930989583334, "learning_rate": 0.0001, "loss": 8.0858, "loss/crossentropy": 2.037451386451721, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.234269917011261, "step": 2062 }, { "epoch": 0.129, "grad_norm": 2.609375, "grad_norm_var": 0.035456339518229164, "learning_rate": 0.0001, "loss": 7.9019, "loss/crossentropy": 2.333581566810608, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2544917017221451, "step": 2064 }, { "epoch": 0.129125, "grad_norm": 2.71875, "grad_norm_var": 0.031248982747395834, "learning_rate": 0.0001, "loss": 8.1845, "loss/crossentropy": 2.3705108165740967, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2595665156841278, "step": 2066 }, { "epoch": 0.12925, "grad_norm": 2.96875, "grad_norm_var": 0.03257548014322917, "learning_rate": 0.0001, "loss": 8.0113, "loss/crossentropy": 2.2181931734085083, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2753777801990509, "step": 2068 }, { "epoch": 0.129375, "grad_norm": 2.6875, "grad_norm_var": 0.028678385416666667, "learning_rate": 0.0001, "loss": 8.046, "loss/crossentropy": 2.4384394884109497, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.260432630777359, "step": 2070 }, { "epoch": 0.1295, "grad_norm": 3.03125, "grad_norm_var": 0.022932942708333334, "learning_rate": 0.0001, "loss": 8.1072, "loss/crossentropy": 2.2950222492218018, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2557060271501541, "step": 2072 }, { "epoch": 0.129625, "grad_norm": 2.796875, "grad_norm_var": 0.0248443603515625, "learning_rate": 0.0001, "loss": 7.8764, "loss/crossentropy": 1.9588146209716797, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2530653849244118, "step": 2074 }, { "epoch": 0.12975, "grad_norm": 2.859375, "grad_norm_var": 0.020759073893229167, "learning_rate": 0.0001, "loss": 8.1442, "loss/crossentropy": 2.174315929412842, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24201688915491104, "step": 2076 }, { "epoch": 0.129875, "grad_norm": 2.875, "grad_norm_var": 0.026642862955729166, "learning_rate": 0.0001, "loss": 7.8332, "loss/crossentropy": 1.8437206149101257, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2369949370622635, "step": 2078 }, { "epoch": 0.13, "grad_norm": 2.765625, "grad_norm_var": 0.0224761962890625, "learning_rate": 0.0001, "loss": 8.1136, "loss/crossentropy": 2.4198756217956543, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2665669322013855, "step": 2080 }, { "epoch": 0.130125, "grad_norm": 2.765625, "grad_norm_var": 0.030143229166666667, "learning_rate": 0.0001, "loss": 7.8881, "loss/crossentropy": 2.051876664161682, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23426489531993866, "step": 2082 }, { "epoch": 0.13025, "grad_norm": 2.59375, "grad_norm_var": 0.031473795572916664, "learning_rate": 0.0001, "loss": 8.0093, "loss/crossentropy": 2.455062747001648, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2640880271792412, "step": 2084 }, { "epoch": 0.130375, "grad_norm": 2.875, "grad_norm_var": 0.03137613932291667, "learning_rate": 0.0001, "loss": 8.0219, "loss/crossentropy": 2.2999762296676636, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2465459704399109, "step": 2086 }, { "epoch": 0.1305, "grad_norm": 2.828125, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 8.0045, "loss/crossentropy": 2.2871525287628174, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24762800335884094, "step": 2088 }, { "epoch": 0.130625, "grad_norm": 2.953125, "grad_norm_var": 0.03261617024739583, "learning_rate": 0.0001, "loss": 7.8668, "loss/crossentropy": 2.3214457035064697, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25224703550338745, "step": 2090 }, { "epoch": 0.13075, "grad_norm": 2.390625, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 8.0465, "loss/crossentropy": 2.1685001850128174, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2591954469680786, "step": 2092 }, { "epoch": 0.130875, "grad_norm": 3.078125, "grad_norm_var": 0.041136678059895834, "learning_rate": 0.0001, "loss": 8.0284, "loss/crossentropy": 2.5781397819519043, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26295357197523117, "step": 2094 }, { "epoch": 0.131, "grad_norm": 2.5, "grad_norm_var": 0.045466105143229164, "learning_rate": 0.0001, "loss": 7.8661, "loss/crossentropy": 2.1926894187927246, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23267576098442078, "step": 2096 }, { "epoch": 0.131125, "grad_norm": 3.1875, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 7.9553, "loss/crossentropy": 2.1911760568618774, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25542213022708893, "step": 2098 }, { "epoch": 0.13125, "grad_norm": 2.78125, "grad_norm_var": 0.04755452473958333, "learning_rate": 0.0001, "loss": 7.958, "loss/crossentropy": 2.2775418758392334, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24572113156318665, "step": 2100 }, { "epoch": 0.131375, "grad_norm": 2.78125, "grad_norm_var": 0.04748942057291667, "learning_rate": 0.0001, "loss": 8.0769, "loss/crossentropy": 2.3823719024658203, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2616809457540512, "step": 2102 }, { "epoch": 0.1315, "grad_norm": 2.90625, "grad_norm_var": 0.047240193684895834, "learning_rate": 0.0001, "loss": 8.0382, "loss/crossentropy": 2.4556522369384766, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2946828603744507, "step": 2104 }, { "epoch": 0.131625, "grad_norm": 3.0, "grad_norm_var": 0.05573628743489583, "learning_rate": 0.0001, "loss": 8.0877, "loss/crossentropy": 2.195865511894226, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26489073038101196, "step": 2106 }, { "epoch": 0.13175, "grad_norm": 2.375, "grad_norm_var": 0.05614827473958333, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.373032331466675, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24483656883239746, "step": 2108 }, { "epoch": 0.131875, "grad_norm": 2.765625, "grad_norm_var": 0.055497233072916666, "learning_rate": 0.0001, "loss": 8.0201, "loss/crossentropy": 2.4894858598709106, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2641270160675049, "step": 2110 }, { "epoch": 0.132, "grad_norm": 2.71875, "grad_norm_var": 0.08896484375, "learning_rate": 0.0001, "loss": 7.9225, "loss/crossentropy": 2.312375068664551, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24002444744110107, "step": 2112 }, { "epoch": 0.132125, "grad_norm": 2.765625, "grad_norm_var": 0.0932037353515625, "learning_rate": 0.0001, "loss": 7.9214, "loss/crossentropy": 2.000899076461792, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23694107681512833, "step": 2114 }, { "epoch": 0.13225, "grad_norm": 2.9375, "grad_norm_var": 0.09410807291666666, "learning_rate": 0.0001, "loss": 8.0863, "loss/crossentropy": 2.2505098581314087, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2779449298977852, "step": 2116 }, { "epoch": 0.132375, "grad_norm": 2.90625, "grad_norm_var": 0.0931549072265625, "learning_rate": 0.0001, "loss": 8.2278, "loss/crossentropy": 2.533925771713257, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27186477184295654, "step": 2118 }, { "epoch": 0.1325, "grad_norm": 3.0625, "grad_norm_var": 0.16379292805989584, "learning_rate": 0.0001, "loss": 7.9833, "loss/crossentropy": 2.2135390043258667, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2325379028916359, "step": 2120 }, { "epoch": 0.132625, "grad_norm": 3.03125, "grad_norm_var": 0.1632232666015625, "learning_rate": 0.0001, "loss": 8.0453, "loss/crossentropy": 2.428161382675171, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26223746687173843, "step": 2122 }, { "epoch": 0.13275, "grad_norm": 2.5625, "grad_norm_var": 0.1512603759765625, "learning_rate": 0.0001, "loss": 7.9349, "loss/crossentropy": 2.6289626359939575, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.272312268614769, "step": 2124 }, { "epoch": 0.132875, "grad_norm": 3.015625, "grad_norm_var": 0.15465087890625, "learning_rate": 0.0001, "loss": 8.0484, "loss/crossentropy": 2.081725239753723, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25036202371120453, "step": 2126 }, { "epoch": 0.133, "grad_norm": 3.109375, "grad_norm_var": 0.12092692057291667, "learning_rate": 0.0001, "loss": 8.1453, "loss/crossentropy": 2.4221293926239014, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2642197906970978, "step": 2128 }, { "epoch": 0.133125, "grad_norm": 2.609375, "grad_norm_var": 0.11172587076822917, "learning_rate": 0.0001, "loss": 8.0771, "loss/crossentropy": 2.385019063949585, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2672020420432091, "step": 2130 }, { "epoch": 0.13325, "grad_norm": 3.046875, "grad_norm_var": 0.10976155598958333, "learning_rate": 0.0001, "loss": 7.9391, "loss/crossentropy": 2.356515049934387, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23601362109184265, "step": 2132 }, { "epoch": 0.133375, "grad_norm": 2.796875, "grad_norm_var": 0.11910400390625, "learning_rate": 0.0001, "loss": 7.8618, "loss/crossentropy": 2.4709160327911377, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26310209929943085, "step": 2134 }, { "epoch": 0.1335, "grad_norm": 2.625, "grad_norm_var": 0.04101155598958333, "learning_rate": 0.0001, "loss": 7.9999, "loss/crossentropy": 2.4431287050247192, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25448351353406906, "step": 2136 }, { "epoch": 0.133625, "grad_norm": 2.734375, "grad_norm_var": 0.0377838134765625, "learning_rate": 0.0001, "loss": 8.0923, "loss/crossentropy": 2.361445426940918, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24513405561447144, "step": 2138 }, { "epoch": 0.13375, "grad_norm": 2.84375, "grad_norm_var": 0.03205464680989583, "learning_rate": 0.0001, "loss": 8.0121, "loss/crossentropy": 2.400641083717346, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2581202983856201, "step": 2140 }, { "epoch": 0.133875, "grad_norm": 2.390625, "grad_norm_var": 0.0412506103515625, "learning_rate": 0.0001, "loss": 7.8979, "loss/crossentropy": 2.0805707573890686, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2818114757537842, "step": 2142 }, { "epoch": 0.134, "grad_norm": 3.375, "grad_norm_var": 0.05451558430989583, "learning_rate": 0.0001, "loss": 8.1184, "loss/crossentropy": 2.480680823326111, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29633618891239166, "step": 2144 }, { "epoch": 0.134125, "grad_norm": 2.625, "grad_norm_var": 0.0557037353515625, "learning_rate": 0.0001, "loss": 7.8102, "loss/crossentropy": 2.002712309360504, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.23709578067064285, "step": 2146 }, { "epoch": 0.13425, "grad_norm": 2.609375, "grad_norm_var": 0.05565999348958333, "learning_rate": 0.0001, "loss": 7.8972, "loss/crossentropy": 2.405007004737854, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2549060881137848, "step": 2148 }, { "epoch": 0.134375, "grad_norm": 2.6875, "grad_norm_var": 0.05328369140625, "learning_rate": 0.0001, "loss": 7.9409, "loss/crossentropy": 2.137619376182556, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2625036686658859, "step": 2150 }, { "epoch": 0.1345, "grad_norm": 2.75, "grad_norm_var": 0.04928385416666667, "learning_rate": 0.0001, "loss": 7.983, "loss/crossentropy": 2.3216036558151245, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26807793229818344, "step": 2152 }, { "epoch": 0.134625, "grad_norm": 3.265625, "grad_norm_var": 0.06470947265625, "learning_rate": 0.0001, "loss": 7.8514, "loss/crossentropy": 2.3814263343811035, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2562423348426819, "step": 2154 }, { "epoch": 0.13475, "grad_norm": 2.75, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 8.0623, "loss/crossentropy": 2.3068708181381226, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2527267262339592, "step": 2156 }, { "epoch": 0.134875, "grad_norm": 2.609375, "grad_norm_var": 0.05712483723958333, "learning_rate": 0.0001, "loss": 8.1027, "loss/crossentropy": 2.242267608642578, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2507360577583313, "step": 2158 }, { "epoch": 0.135, "grad_norm": 2.671875, "grad_norm_var": 0.04262593587239583, "learning_rate": 0.0001, "loss": 7.7535, "loss/crossentropy": 2.259010672569275, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2302761897444725, "step": 2160 }, { "epoch": 0.135125, "grad_norm": 2.90625, "grad_norm_var": 0.04046223958333333, "learning_rate": 0.0001, "loss": 7.8616, "loss/crossentropy": 2.195171058177948, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23362931609153748, "step": 2162 }, { "epoch": 0.13525, "grad_norm": 2.765625, "grad_norm_var": 0.03961588541666667, "learning_rate": 0.0001, "loss": 7.9037, "loss/crossentropy": 2.306097149848938, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.27228541672229767, "step": 2164 }, { "epoch": 0.135375, "grad_norm": 2.75, "grad_norm_var": 0.039937337239583336, "learning_rate": 0.0001, "loss": 7.9223, "loss/crossentropy": 2.354483962059021, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25157542526721954, "step": 2166 }, { "epoch": 0.1355, "grad_norm": 2.953125, "grad_norm_var": 0.04169514973958333, "learning_rate": 0.0001, "loss": 8.0153, "loss/crossentropy": 2.555723190307617, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2608166038990021, "step": 2168 }, { "epoch": 0.135625, "grad_norm": 2.59375, "grad_norm_var": 0.026976521809895834, "learning_rate": 0.0001, "loss": 8.0232, "loss/crossentropy": 2.314175248146057, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2557505890727043, "step": 2170 }, { "epoch": 0.13575, "grad_norm": 2.640625, "grad_norm_var": 0.0195709228515625, "learning_rate": 0.0001, "loss": 7.9036, "loss/crossentropy": 2.252376437187195, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26094751060009, "step": 2172 }, { "epoch": 0.135875, "grad_norm": 2.578125, "grad_norm_var": 0.020182291666666668, "learning_rate": 0.0001, "loss": 7.7816, "loss/crossentropy": 2.0808385610580444, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25295551121234894, "step": 2174 }, { "epoch": 0.136, "grad_norm": 2.703125, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 7.9248, "loss/crossentropy": 2.2166486978530884, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2506643235683441, "step": 2176 }, { "epoch": 0.136125, "grad_norm": 2.734375, "grad_norm_var": 0.0152496337890625, "learning_rate": 0.0001, "loss": 7.9413, "loss/crossentropy": 2.4114983081817627, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24629026651382446, "step": 2178 }, { "epoch": 0.13625, "grad_norm": 2.984375, "grad_norm_var": 0.017508951822916667, "learning_rate": 0.0001, "loss": 7.9527, "loss/crossentropy": 1.9971612095832825, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23709237575531006, "step": 2180 }, { "epoch": 0.136375, "grad_norm": 2.734375, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 8.0183, "loss/crossentropy": 2.598210096359253, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2603771388530731, "step": 2182 }, { "epoch": 0.1365, "grad_norm": 2.9375, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 8.0132, "loss/crossentropy": 2.382105231285095, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24392583966255188, "step": 2184 }, { "epoch": 0.136625, "grad_norm": 2.59375, "grad_norm_var": 0.022834269205729167, "learning_rate": 0.0001, "loss": 8.047, "loss/crossentropy": 2.4047285318374634, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27039480209350586, "step": 2186 }, { "epoch": 0.13675, "grad_norm": 2.4375, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 7.7846, "loss/crossentropy": 2.0133553743362427, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.242934912443161, "step": 2188 }, { "epoch": 0.136875, "grad_norm": 3.078125, "grad_norm_var": 0.043701171875, "learning_rate": 0.0001, "loss": 8.1042, "loss/crossentropy": 2.4624531269073486, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26904913038015366, "step": 2190 }, { "epoch": 0.137, "grad_norm": 2.640625, "grad_norm_var": 0.048216756184895834, "learning_rate": 0.0001, "loss": 7.9589, "loss/crossentropy": 2.3441646099090576, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2756097614765167, "step": 2192 }, { "epoch": 0.137125, "grad_norm": 2.609375, "grad_norm_var": 0.0473297119140625, "learning_rate": 0.0001, "loss": 7.9626, "loss/crossentropy": 2.0180709958076477, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22330156713724136, "step": 2194 }, { "epoch": 0.13725, "grad_norm": 4.875, "grad_norm_var": 0.3212076822916667, "learning_rate": 0.0001, "loss": 8.1305, "loss/crossentropy": 2.247686982154846, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.257401205599308, "step": 2196 }, { "epoch": 0.137375, "grad_norm": 3.125, "grad_norm_var": 0.3214182535807292, "learning_rate": 0.0001, "loss": 7.8432, "loss/crossentropy": 2.387032985687256, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2718297243118286, "step": 2198 }, { "epoch": 0.1375, "grad_norm": 2.84375, "grad_norm_var": 0.3174957275390625, "learning_rate": 0.0001, "loss": 7.9656, "loss/crossentropy": 2.1113094091415405, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2628230005502701, "step": 2200 }, { "epoch": 0.137625, "grad_norm": 2.84375, "grad_norm_var": 0.3123931884765625, "learning_rate": 0.0001, "loss": 8.1272, "loss/crossentropy": 2.681854248046875, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.30167150497436523, "step": 2202 }, { "epoch": 0.13775, "grad_norm": 2.71875, "grad_norm_var": 0.28544514973958335, "learning_rate": 0.0001, "loss": 7.8842, "loss/crossentropy": 2.2539913654327393, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2667968273162842, "step": 2204 }, { "epoch": 0.137875, "grad_norm": 2.984375, "grad_norm_var": 0.28813374837239586, "learning_rate": 0.0001, "loss": 7.994, "loss/crossentropy": 2.336976170539856, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2730831503868103, "step": 2206 }, { "epoch": 0.138, "grad_norm": 2.90625, "grad_norm_var": 0.28128153483072915, "learning_rate": 0.0001, "loss": 7.8908, "loss/crossentropy": 2.137080729007721, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2353602722287178, "step": 2208 }, { "epoch": 0.138125, "grad_norm": 2.5, "grad_norm_var": 0.28933919270833336, "learning_rate": 0.0001, "loss": 7.8058, "loss/crossentropy": 2.0887175798416138, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2580728679895401, "step": 2210 }, { "epoch": 0.13825, "grad_norm": 2.640625, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 8.0849, "loss/crossentropy": 1.7958271503448486, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24919769912958145, "step": 2212 }, { "epoch": 0.138375, "grad_norm": 2.640625, "grad_norm_var": 0.0386871337890625, "learning_rate": 0.0001, "loss": 7.8756, "loss/crossentropy": 2.135870099067688, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24682459235191345, "step": 2214 }, { "epoch": 0.1385, "grad_norm": 2.734375, "grad_norm_var": 0.08369038899739584, "learning_rate": 0.0001, "loss": 8.0304, "loss/crossentropy": 2.346623420715332, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24961213767528534, "step": 2216 }, { "epoch": 0.138625, "grad_norm": 2.703125, "grad_norm_var": 0.07737630208333333, "learning_rate": 0.0001, "loss": 8.0556, "loss/crossentropy": 2.195094585418701, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24484457075595856, "step": 2218 }, { "epoch": 0.13875, "grad_norm": 2.703125, "grad_norm_var": 0.07613525390625, "learning_rate": 0.0001, "loss": 8.0197, "loss/crossentropy": 2.2862359285354614, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2567315921187401, "step": 2220 }, { "epoch": 0.138875, "grad_norm": 2.546875, "grad_norm_var": 0.07868550618489584, "learning_rate": 0.0001, "loss": 7.9319, "loss/crossentropy": 2.331244111061096, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2514599338173866, "step": 2222 }, { "epoch": 0.139, "grad_norm": 3.15625, "grad_norm_var": 0.08726806640625, "learning_rate": 0.0001, "loss": 7.8826, "loss/crossentropy": 1.9546465873718262, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23689769953489304, "step": 2224 }, { "epoch": 0.139125, "grad_norm": 2.3125, "grad_norm_var": 0.09897359212239583, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.0505433082580566, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24545447528362274, "step": 2226 }, { "epoch": 0.13925, "grad_norm": 2.828125, "grad_norm_var": 0.10256245930989584, "learning_rate": 0.0001, "loss": 7.9768, "loss/crossentropy": 2.3643672466278076, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2489551082253456, "step": 2228 }, { "epoch": 0.139375, "grad_norm": 2.96875, "grad_norm_var": 0.09179280598958334, "learning_rate": 0.0001, "loss": 7.9637, "loss/crossentropy": 2.4726024866104126, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2436269074678421, "step": 2230 }, { "epoch": 0.1395, "grad_norm": 2.765625, "grad_norm_var": 0.04537760416666667, "learning_rate": 0.0001, "loss": 8.1119, "loss/crossentropy": 2.409575581550598, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.258441299200058, "step": 2232 }, { "epoch": 0.139625, "grad_norm": 2.546875, "grad_norm_var": 0.04572652180989583, "learning_rate": 0.0001, "loss": 7.8413, "loss/crossentropy": 2.37674617767334, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26087239384651184, "step": 2234 }, { "epoch": 0.13975, "grad_norm": 2.796875, "grad_norm_var": 0.04651590983072917, "learning_rate": 0.0001, "loss": 7.9557, "loss/crossentropy": 2.270553708076477, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24740490317344666, "step": 2236 }, { "epoch": 0.139875, "grad_norm": 2.703125, "grad_norm_var": 0.04517822265625, "learning_rate": 0.0001, "loss": 7.8289, "loss/crossentropy": 2.32234787940979, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2481185868382454, "step": 2238 }, { "epoch": 0.14, "grad_norm": 2.71875, "grad_norm_var": 0.03321024576822917, "learning_rate": 0.0001, "loss": 8.0077, "loss/crossentropy": 2.4945857524871826, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2944895774126053, "step": 2240 }, { "epoch": 0.140125, "grad_norm": 2.609375, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 7.8994, "loss/crossentropy": 2.3556969165802, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2561969757080078, "step": 2242 }, { "epoch": 0.14025, "grad_norm": 2.8125, "grad_norm_var": 0.013765462239583333, "learning_rate": 0.0001, "loss": 7.9348, "loss/crossentropy": 2.219905376434326, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.273133248090744, "step": 2244 }, { "epoch": 0.140375, "grad_norm": 2.734375, "grad_norm_var": 0.00855712890625, "learning_rate": 0.0001, "loss": 8.0431, "loss/crossentropy": 2.1384077668190002, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.237950399518013, "step": 2246 }, { "epoch": 0.1405, "grad_norm": 2.8125, "grad_norm_var": 0.0127349853515625, "learning_rate": 0.0001, "loss": 7.83, "loss/crossentropy": 2.3398306369781494, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25574472546577454, "step": 2248 }, { "epoch": 0.140625, "grad_norm": 2.484375, "grad_norm_var": 0.015608723958333333, "learning_rate": 0.0001, "loss": 8.033, "loss/crossentropy": 2.24453866481781, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23751161247491837, "step": 2250 }, { "epoch": 0.14075, "grad_norm": 2.65625, "grad_norm_var": 0.014388020833333333, "learning_rate": 0.0001, "loss": 7.8561, "loss/crossentropy": 1.9904406070709229, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24353782087564468, "step": 2252 }, { "epoch": 0.140875, "grad_norm": 2.578125, "grad_norm_var": 0.017252604166666668, "learning_rate": 0.0001, "loss": 7.866, "loss/crossentropy": 2.367901563644409, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24962928891181946, "step": 2254 }, { "epoch": 0.141, "grad_norm": 2.984375, "grad_norm_var": 0.0227447509765625, "learning_rate": 0.0001, "loss": 8.0945, "loss/crossentropy": 2.3909614086151123, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2490956410765648, "step": 2256 }, { "epoch": 0.141125, "grad_norm": 2.75, "grad_norm_var": 0.0219635009765625, "learning_rate": 0.0001, "loss": 8.0642, "loss/crossentropy": 2.158316493034363, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2531931698322296, "step": 2258 }, { "epoch": 0.14125, "grad_norm": 2.671875, "grad_norm_var": 0.0224761962890625, "learning_rate": 0.0001, "loss": 7.8171, "loss/crossentropy": 2.163187623023987, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24659747630357742, "step": 2260 }, { "epoch": 0.141375, "grad_norm": 2.765625, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 7.8513, "loss/crossentropy": 2.2378615140914917, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.232786126434803, "step": 2262 }, { "epoch": 0.1415, "grad_norm": 2.859375, "grad_norm_var": 0.0192047119140625, "learning_rate": 0.0001, "loss": 7.9754, "loss/crossentropy": 2.50004506111145, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24566112458705902, "step": 2264 }, { "epoch": 0.141625, "grad_norm": 2.734375, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 7.9826, "loss/crossentropy": 2.192861318588257, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26260973513126373, "step": 2266 }, { "epoch": 0.14175, "grad_norm": 2.6875, "grad_norm_var": 0.0215240478515625, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.3900744915008545, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23667296767234802, "step": 2268 }, { "epoch": 0.141875, "grad_norm": 2.671875, "grad_norm_var": 0.0174713134765625, "learning_rate": 0.0001, "loss": 7.9866, "loss/crossentropy": 2.459189772605896, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2659824937582016, "step": 2270 }, { "epoch": 0.142, "grad_norm": 2.765625, "grad_norm_var": 0.0124176025390625, "learning_rate": 0.0001, "loss": 7.9221, "loss/crossentropy": 2.3119730949401855, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2727499008178711, "step": 2272 }, { "epoch": 0.142125, "grad_norm": 3.03125, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 8.0506, "loss/crossentropy": 2.065304398536682, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2462991625070572, "step": 2274 }, { "epoch": 0.14225, "grad_norm": 2.484375, "grad_norm_var": 0.022021484375, "learning_rate": 0.0001, "loss": 7.8896, "loss/crossentropy": 2.293634057044983, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23517119884490967, "step": 2276 }, { "epoch": 0.142375, "grad_norm": 2.640625, "grad_norm_var": 0.0225250244140625, "learning_rate": 0.0001, "loss": 7.9233, "loss/crossentropy": 2.193318486213684, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24228639900684357, "step": 2278 }, { "epoch": 0.1425, "grad_norm": 2.90625, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.9282, "loss/crossentropy": 2.3415223360061646, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25495699793100357, "step": 2280 }, { "epoch": 0.142625, "grad_norm": 2.546875, "grad_norm_var": 0.027318318684895832, "learning_rate": 0.0001, "loss": 7.8591, "loss/crossentropy": 1.9665740132331848, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2337390035390854, "step": 2282 }, { "epoch": 0.14275, "grad_norm": 2.828125, "grad_norm_var": 0.024193318684895833, "learning_rate": 0.0001, "loss": 7.7983, "loss/crossentropy": 2.0722063779830933, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23800316452980042, "step": 2284 }, { "epoch": 0.142875, "grad_norm": 2.59375, "grad_norm_var": 0.025093587239583333, "learning_rate": 0.0001, "loss": 7.9698, "loss/crossentropy": 2.3465652465820312, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2537553757429123, "step": 2286 }, { "epoch": 0.143, "grad_norm": 3.375, "grad_norm_var": 0.057938639322916666, "learning_rate": 0.0001, "loss": 7.8197, "loss/crossentropy": 2.4646483659744263, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2531234845519066, "step": 2288 }, { "epoch": 0.143125, "grad_norm": 2.359375, "grad_norm_var": 0.06323140462239583, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.255491614341736, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24858752638101578, "step": 2290 }, { "epoch": 0.14325, "grad_norm": 2.953125, "grad_norm_var": 0.06523030598958333, "learning_rate": 0.0001, "loss": 7.9138, "loss/crossentropy": 2.0624001026153564, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23998911678791046, "step": 2292 }, { "epoch": 0.143375, "grad_norm": 2.609375, "grad_norm_var": 0.06442769368489583, "learning_rate": 0.0001, "loss": 7.9531, "loss/crossentropy": 2.4339091777801514, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2569551467895508, "step": 2294 }, { "epoch": 0.1435, "grad_norm": 2.59375, "grad_norm_var": 0.06369527180989583, "learning_rate": 0.0001, "loss": 7.9041, "loss/crossentropy": 2.0448151230812073, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2471894770860672, "step": 2296 }, { "epoch": 0.143625, "grad_norm": 2.578125, "grad_norm_var": 0.0640625, "learning_rate": 0.0001, "loss": 8.0666, "loss/crossentropy": 2.415607452392578, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2683194726705551, "step": 2298 }, { "epoch": 0.14375, "grad_norm": 2.8125, "grad_norm_var": 0.061930338541666664, "learning_rate": 0.0001, "loss": 7.9572, "loss/crossentropy": 2.294751286506653, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2594631314277649, "step": 2300 }, { "epoch": 0.143875, "grad_norm": 2.8125, "grad_norm_var": 0.06265869140625, "learning_rate": 0.0001, "loss": 7.7507, "loss/crossentropy": 2.219098746776581, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2452593445777893, "step": 2302 }, { "epoch": 0.144, "grad_norm": 2.6875, "grad_norm_var": 0.024833170572916667, "learning_rate": 0.0001, "loss": 7.7452, "loss/crossentropy": 2.273571014404297, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23514091968536377, "step": 2304 }, { "epoch": 0.144125, "grad_norm": 2.8125, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 8.092, "loss/crossentropy": 2.470995545387268, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2565620690584183, "step": 2306 }, { "epoch": 0.14425, "grad_norm": 2.671875, "grad_norm_var": 0.022508748372395835, "learning_rate": 0.0001, "loss": 8.0169, "loss/crossentropy": 2.4442650079727173, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2491839900612831, "step": 2308 }, { "epoch": 0.144375, "grad_norm": 2.515625, "grad_norm_var": 0.024120076497395834, "learning_rate": 0.0001, "loss": 7.8243, "loss/crossentropy": 2.3042315244674683, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24753264337778091, "step": 2310 }, { "epoch": 0.1445, "grad_norm": 2.609375, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.247647523880005, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.242111474275589, "step": 2312 }, { "epoch": 0.144625, "grad_norm": 2.765625, "grad_norm_var": 0.021142578125, "learning_rate": 0.0001, "loss": 7.8245, "loss/crossentropy": 2.0948009490966797, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22648434340953827, "step": 2314 }, { "epoch": 0.14475, "grad_norm": 2.578125, "grad_norm_var": 0.024540201822916666, "learning_rate": 0.0001, "loss": 7.9081, "loss/crossentropy": 2.3449655771255493, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25433051586151123, "step": 2316 }, { "epoch": 0.144875, "grad_norm": 2.40625, "grad_norm_var": 0.03191731770833333, "learning_rate": 0.0001, "loss": 7.8692, "loss/crossentropy": 2.1519018411636353, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23911522328853607, "step": 2318 }, { "epoch": 0.145, "grad_norm": 3.046875, "grad_norm_var": 0.03612874348958333, "learning_rate": 0.0001, "loss": 8.0635, "loss/crossentropy": 2.0561267137527466, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23246531933546066, "step": 2320 }, { "epoch": 0.145125, "grad_norm": 2.578125, "grad_norm_var": 0.031248982747395834, "learning_rate": 0.0001, "loss": 8.0962, "loss/crossentropy": 2.413579821586609, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2741449773311615, "step": 2322 }, { "epoch": 0.14525, "grad_norm": 3.0, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 7.8067, "loss/crossentropy": 2.215983271598816, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24622830748558044, "step": 2324 }, { "epoch": 0.145375, "grad_norm": 2.484375, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 7.9424, "loss/crossentropy": 2.2071104049682617, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24660057574510574, "step": 2326 }, { "epoch": 0.1455, "grad_norm": 2.890625, "grad_norm_var": 0.0425689697265625, "learning_rate": 0.0001, "loss": 8.0007, "loss/crossentropy": 2.1792843341827393, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2711133062839508, "step": 2328 }, { "epoch": 0.145625, "grad_norm": 2.609375, "grad_norm_var": 0.0416015625, "learning_rate": 0.0001, "loss": 7.9595, "loss/crossentropy": 2.256834030151367, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.26283788681030273, "step": 2330 }, { "epoch": 0.14575, "grad_norm": 2.390625, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 7.7117, "loss/crossentropy": 1.91128808259964, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23225348442792892, "step": 2332 }, { "epoch": 0.145875, "grad_norm": 2.78125, "grad_norm_var": 0.03492431640625, "learning_rate": 0.0001, "loss": 7.9979, "loss/crossentropy": 2.1611289978027344, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2694767862558365, "step": 2334 }, { "epoch": 0.146, "grad_norm": 2.796875, "grad_norm_var": 0.029423014322916666, "learning_rate": 0.0001, "loss": 7.9771, "loss/crossentropy": 2.3651944398880005, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24654380977153778, "step": 2336 }, { "epoch": 0.146125, "grad_norm": 2.78125, "grad_norm_var": 0.03247782389322917, "learning_rate": 0.0001, "loss": 8.0897, "loss/crossentropy": 2.365579605102539, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26178716123104095, "step": 2338 }, { "epoch": 0.14625, "grad_norm": 2.875, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 7.8216, "loss/crossentropy": 2.195146918296814, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2360374853014946, "step": 2340 }, { "epoch": 0.146375, "grad_norm": 2.484375, "grad_norm_var": 0.02750244140625, "learning_rate": 0.0001, "loss": 8.0719, "loss/crossentropy": 2.6680378913879395, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2590227723121643, "step": 2342 }, { "epoch": 0.1465, "grad_norm": 2.765625, "grad_norm_var": 0.025227864583333332, "learning_rate": 0.0001, "loss": 7.9196, "loss/crossentropy": 2.307919979095459, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2748124748468399, "step": 2344 }, { "epoch": 0.146625, "grad_norm": 2.609375, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 8.0198, "loss/crossentropy": 2.478832721710205, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25123097002506256, "step": 2346 }, { "epoch": 0.14675, "grad_norm": 3.109375, "grad_norm_var": 0.025715128580729166, "learning_rate": 0.0001, "loss": 7.998, "loss/crossentropy": 2.1463791131973267, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2501315772533417, "step": 2348 }, { "epoch": 0.146875, "grad_norm": 2.375, "grad_norm_var": 0.0401519775390625, "learning_rate": 0.0001, "loss": 7.9402, "loss/crossentropy": 2.287923812866211, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2529391869902611, "step": 2350 }, { "epoch": 0.147, "grad_norm": 2.84375, "grad_norm_var": 0.04274800618489583, "learning_rate": 0.0001, "loss": 7.8521, "loss/crossentropy": 2.393770456314087, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23246632516384125, "step": 2352 }, { "epoch": 0.147125, "grad_norm": 2.828125, "grad_norm_var": 0.03935445149739583, "learning_rate": 0.0001, "loss": 7.9868, "loss/crossentropy": 1.9886181354522705, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2457761988043785, "step": 2354 }, { "epoch": 0.14725, "grad_norm": 2.734375, "grad_norm_var": 0.0428375244140625, "learning_rate": 0.0001, "loss": 7.9631, "loss/crossentropy": 2.1264249682426453, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.261639803647995, "step": 2356 }, { "epoch": 0.147375, "grad_norm": 2.875, "grad_norm_var": 0.0380767822265625, "learning_rate": 0.0001, "loss": 7.8278, "loss/crossentropy": 2.0697389245033264, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.22177913784980774, "step": 2358 }, { "epoch": 0.1475, "grad_norm": 2.859375, "grad_norm_var": 0.0396484375, "learning_rate": 0.0001, "loss": 7.9523, "loss/crossentropy": 2.355503797531128, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.257536381483078, "step": 2360 }, { "epoch": 0.147625, "grad_norm": 2.484375, "grad_norm_var": 0.0458984375, "learning_rate": 0.0001, "loss": 7.7195, "loss/crossentropy": 2.263971447944641, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2459626868367195, "step": 2362 }, { "epoch": 0.14775, "grad_norm": 2.578125, "grad_norm_var": 0.034886678059895836, "learning_rate": 0.0001, "loss": 7.8342, "loss/crossentropy": 2.3069592714309692, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25156907737255096, "step": 2364 }, { "epoch": 0.147875, "grad_norm": 2.890625, "grad_norm_var": 0.025902303059895833, "learning_rate": 0.0001, "loss": 7.8323, "loss/crossentropy": 2.343958616256714, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23790227621793747, "step": 2366 }, { "epoch": 0.148, "grad_norm": 2.6875, "grad_norm_var": 0.02525634765625, "learning_rate": 0.0001, "loss": 8.0088, "loss/crossentropy": 2.1950390338897705, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25015532970428467, "step": 2368 }, { "epoch": 0.148125, "grad_norm": 2.40625, "grad_norm_var": 0.0327789306640625, "learning_rate": 0.0001, "loss": 7.7179, "loss/crossentropy": 2.18330717086792, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23950626701116562, "step": 2370 }, { "epoch": 0.14825, "grad_norm": 3.109375, "grad_norm_var": 0.04156494140625, "learning_rate": 0.0001, "loss": 7.89, "loss/crossentropy": 2.359447479248047, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25177963823080063, "step": 2372 }, { "epoch": 0.148375, "grad_norm": 2.921875, "grad_norm_var": 0.048075358072916664, "learning_rate": 0.0001, "loss": 7.8616, "loss/crossentropy": 2.1051629185676575, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23466359078884125, "step": 2374 }, { "epoch": 0.1485, "grad_norm": 2.921875, "grad_norm_var": 0.05561421712239583, "learning_rate": 0.0001, "loss": 7.8136, "loss/crossentropy": 2.4187822341918945, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24878371506929398, "step": 2376 }, { "epoch": 0.148625, "grad_norm": 2.703125, "grad_norm_var": 0.05022786458333333, "learning_rate": 0.0001, "loss": 7.8544, "loss/crossentropy": 2.3044979572296143, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23919613659381866, "step": 2378 }, { "epoch": 0.14875, "grad_norm": 2.515625, "grad_norm_var": 0.05283203125, "learning_rate": 0.0001, "loss": 7.9553, "loss/crossentropy": 2.334774613380432, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26303067803382874, "step": 2380 }, { "epoch": 0.148875, "grad_norm": 2.8125, "grad_norm_var": 0.051268513997395834, "learning_rate": 0.0001, "loss": 7.8011, "loss/crossentropy": 2.1512030363082886, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2616318315267563, "step": 2382 }, { "epoch": 0.149, "grad_norm": 2.546875, "grad_norm_var": 0.04903971354166667, "learning_rate": 0.0001, "loss": 7.9319, "loss/crossentropy": 2.1831018924713135, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24666057527065277, "step": 2384 }, { "epoch": 0.149125, "grad_norm": 3.125, "grad_norm_var": 0.05806884765625, "learning_rate": 0.0001, "loss": 8.0404, "loss/crossentropy": 2.278907299041748, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24930745363235474, "step": 2386 }, { "epoch": 0.14925, "grad_norm": 2.46875, "grad_norm_var": 0.04917704264322917, "learning_rate": 0.0001, "loss": 7.8234, "loss/crossentropy": 2.117949962615967, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2466331273317337, "step": 2388 }, { "epoch": 0.149375, "grad_norm": 3.21875, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 7.9321, "loss/crossentropy": 2.4539263248443604, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2619713842868805, "step": 2390 }, { "epoch": 0.1495, "grad_norm": 2.703125, "grad_norm_var": 0.0503082275390625, "learning_rate": 0.0001, "loss": 8.0366, "loss/crossentropy": 2.155607759952545, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2557392567396164, "step": 2392 }, { "epoch": 0.149625, "grad_norm": 2.828125, "grad_norm_var": 0.0584625244140625, "learning_rate": 0.0001, "loss": 7.8701, "loss/crossentropy": 2.226928472518921, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.227652445435524, "step": 2394 }, { "epoch": 0.14975, "grad_norm": 2.53125, "grad_norm_var": 0.05657552083333333, "learning_rate": 0.0001, "loss": 7.9392, "loss/crossentropy": 2.152814030647278, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22992898523807526, "step": 2396 }, { "epoch": 0.149875, "grad_norm": 2.546875, "grad_norm_var": 0.061258951822916664, "learning_rate": 0.0001, "loss": 7.877, "loss/crossentropy": 2.0306188464164734, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2434871345758438, "step": 2398 }, { "epoch": 0.15, "grad_norm": 2.90625, "grad_norm_var": 0.057112630208333334, "learning_rate": 0.0001, "loss": 7.9455, "loss/crossentropy": 2.3572858572006226, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24068891257047653, "step": 2400 }, { "epoch": 0.150125, "grad_norm": 2.3125, "grad_norm_var": 0.057112630208333334, "learning_rate": 0.0001, "loss": 7.8958, "loss/crossentropy": 2.4808624982833862, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2544522359967232, "step": 2402 }, { "epoch": 0.15025, "grad_norm": 2.765625, "grad_norm_var": 0.055052693684895834, "learning_rate": 0.0001, "loss": 7.8099, "loss/crossentropy": 2.0727401971817017, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25555629283189774, "step": 2404 }, { "epoch": 0.150375, "grad_norm": 2.921875, "grad_norm_var": 0.04289957682291667, "learning_rate": 0.0001, "loss": 7.9492, "loss/crossentropy": 2.3265267610549927, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2484055981040001, "step": 2406 }, { "epoch": 0.1505, "grad_norm": 2.96875, "grad_norm_var": 0.05078837076822917, "learning_rate": 0.0001, "loss": 7.9314, "loss/crossentropy": 2.455584764480591, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2524856925010681, "step": 2408 }, { "epoch": 0.150625, "grad_norm": 2.59375, "grad_norm_var": 0.044694010416666666, "learning_rate": 0.0001, "loss": 7.8646, "loss/crossentropy": 2.425659656524658, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2739051878452301, "step": 2410 }, { "epoch": 0.15075, "grad_norm": 2.5625, "grad_norm_var": 0.042967732747395834, "learning_rate": 0.0001, "loss": 7.9698, "loss/crossentropy": 2.306099534034729, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24583810567855835, "step": 2412 }, { "epoch": 0.150875, "grad_norm": 2.734375, "grad_norm_var": 0.04340718587239583, "learning_rate": 0.0001, "loss": 7.7465, "loss/crossentropy": 2.4911707639694214, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24410546571016312, "step": 2414 }, { "epoch": 0.151, "grad_norm": 2.75, "grad_norm_var": 0.04588114420572917, "learning_rate": 0.0001, "loss": 7.8045, "loss/crossentropy": 2.461613178253174, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2660996913909912, "step": 2416 }, { "epoch": 0.151125, "grad_norm": 2.328125, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 7.7771, "loss/crossentropy": 2.1200402975082397, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23757921904325485, "step": 2418 }, { "epoch": 0.15125, "grad_norm": 2.625, "grad_norm_var": 0.048029581705729164, "learning_rate": 0.0001, "loss": 7.8634, "loss/crossentropy": 2.0594701766967773, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2412218227982521, "step": 2420 }, { "epoch": 0.151375, "grad_norm": 3.390625, "grad_norm_var": 0.09226786295572917, "learning_rate": 0.0001, "loss": 7.9832, "loss/crossentropy": 2.171906590461731, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24810399115085602, "step": 2422 }, { "epoch": 0.1515, "grad_norm": 2.328125, "grad_norm_var": 0.09123942057291666, "learning_rate": 0.0001, "loss": 7.8273, "loss/crossentropy": 2.5124112367630005, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2484402135014534, "step": 2424 }, { "epoch": 0.151625, "grad_norm": 2.703125, "grad_norm_var": 0.0913726806640625, "learning_rate": 0.0001, "loss": 7.782, "loss/crossentropy": 2.3001959323883057, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24032824486494064, "step": 2426 }, { "epoch": 0.15175, "grad_norm": 2.546875, "grad_norm_var": 0.0932525634765625, "learning_rate": 0.0001, "loss": 7.7919, "loss/crossentropy": 2.170712888240814, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.26182495057582855, "step": 2428 }, { "epoch": 0.151875, "grad_norm": 2.84375, "grad_norm_var": 0.093359375, "learning_rate": 0.0001, "loss": 7.9205, "loss/crossentropy": 2.2728850841522217, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2512703761458397, "step": 2430 }, { "epoch": 0.152, "grad_norm": 2.46875, "grad_norm_var": 0.0877838134765625, "learning_rate": 0.0001, "loss": 7.7564, "loss/crossentropy": 2.2336456775665283, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24472886323928833, "step": 2432 }, { "epoch": 0.152125, "grad_norm": 2.75, "grad_norm_var": 0.07998758951822917, "learning_rate": 0.0001, "loss": 7.792, "loss/crossentropy": 2.0555617809295654, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.21662656217813492, "step": 2434 }, { "epoch": 0.15225, "grad_norm": 2.6875, "grad_norm_var": 0.07681376139322917, "learning_rate": 0.0001, "loss": 7.852, "loss/crossentropy": 2.1744298934936523, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25150124728679657, "step": 2436 }, { "epoch": 0.152375, "grad_norm": 2.78125, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 7.7934, "loss/crossentropy": 2.0990302562713623, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22628428786993027, "step": 2438 }, { "epoch": 0.1525, "grad_norm": 2.671875, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 7.8081, "loss/crossentropy": 2.2353204488754272, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26493804156780243, "step": 2440 }, { "epoch": 0.152625, "grad_norm": 2.953125, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 8.0923, "loss/crossentropy": 2.281570076942444, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2562706768512726, "step": 2442 }, { "epoch": 0.15275, "grad_norm": 2.34375, "grad_norm_var": 0.0250396728515625, "learning_rate": 0.0001, "loss": 7.773, "loss/crossentropy": 2.251350522041321, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2373836562037468, "step": 2444 }, { "epoch": 0.152875, "grad_norm": 3.34375, "grad_norm_var": 0.0528472900390625, "learning_rate": 0.0001, "loss": 7.976, "loss/crossentropy": 2.2960145473480225, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25969095528125763, "step": 2446 }, { "epoch": 0.153, "grad_norm": 2.28125, "grad_norm_var": 0.05984700520833333, "learning_rate": 0.0001, "loss": 7.8294, "loss/crossentropy": 2.28712797164917, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2389351725578308, "step": 2448 }, { "epoch": 0.153125, "grad_norm": 2.890625, "grad_norm_var": 0.06357421875, "learning_rate": 0.0001, "loss": 7.9498, "loss/crossentropy": 2.170054316520691, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24584101140499115, "step": 2450 }, { "epoch": 0.15325, "grad_norm": 2.78125, "grad_norm_var": 0.07652587890625, "learning_rate": 0.0001, "loss": 8.0025, "loss/crossentropy": 2.2106114625930786, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24998192489147186, "step": 2452 }, { "epoch": 0.153375, "grad_norm": 2.4375, "grad_norm_var": 0.07939046223958333, "learning_rate": 0.0001, "loss": 7.7971, "loss/crossentropy": 2.2459070682525635, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24219267070293427, "step": 2454 }, { "epoch": 0.1535, "grad_norm": 2.59375, "grad_norm_var": 0.08028971354166667, "learning_rate": 0.0001, "loss": 7.8238, "loss/crossentropy": 2.3142576217651367, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25120753794908524, "step": 2456 }, { "epoch": 0.153625, "grad_norm": 2.640625, "grad_norm_var": 0.0760406494140625, "learning_rate": 0.0001, "loss": 7.9069, "loss/crossentropy": 2.390681028366089, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2660007178783417, "step": 2458 }, { "epoch": 0.15375, "grad_norm": 2.6875, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 7.9599, "loss/crossentropy": 2.3622519969940186, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2376810610294342, "step": 2460 }, { "epoch": 0.153875, "grad_norm": 2.765625, "grad_norm_var": 0.05569254557291667, "learning_rate": 0.0001, "loss": 7.7633, "loss/crossentropy": 2.587849259376526, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2587762326002121, "step": 2462 }, { "epoch": 0.154, "grad_norm": 2.5625, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 8.0367, "loss/crossentropy": 2.0119821429252625, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23865149170160294, "step": 2464 }, { "epoch": 0.154125, "grad_norm": 2.578125, "grad_norm_var": 0.04401041666666667, "learning_rate": 0.0001, "loss": 7.9178, "loss/crossentropy": 2.278993248939514, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2538345381617546, "step": 2466 }, { "epoch": 0.15425, "grad_norm": 2.578125, "grad_norm_var": 0.029352823893229168, "learning_rate": 0.0001, "loss": 7.8351, "loss/crossentropy": 2.2865071296691895, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24299630522727966, "step": 2468 }, { "epoch": 0.154375, "grad_norm": 2.796875, "grad_norm_var": 0.027497355143229166, "learning_rate": 0.0001, "loss": 7.7031, "loss/crossentropy": 2.118988275527954, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23056157678365707, "step": 2470 }, { "epoch": 0.1545, "grad_norm": 2.25, "grad_norm_var": 0.0398101806640625, "learning_rate": 0.0001, "loss": 7.7647, "loss/crossentropy": 2.122451901435852, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2540634050965309, "step": 2472 }, { "epoch": 0.154625, "grad_norm": 2.875, "grad_norm_var": 0.041845703125, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.2460381984710693, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2485114336013794, "step": 2474 }, { "epoch": 0.15475, "grad_norm": 2.828125, "grad_norm_var": 0.0369049072265625, "learning_rate": 0.0001, "loss": 7.8238, "loss/crossentropy": 2.2321892976760864, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24692068994045258, "step": 2476 }, { "epoch": 0.154875, "grad_norm": 2.671875, "grad_norm_var": 0.0273834228515625, "learning_rate": 0.0001, "loss": 7.8107, "loss/crossentropy": 2.47454035282135, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24419991672039032, "step": 2478 }, { "epoch": 0.155, "grad_norm": 2.703125, "grad_norm_var": 0.026399739583333335, "learning_rate": 0.0001, "loss": 7.782, "loss/crossentropy": 2.1783688068389893, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24744782596826553, "step": 2480 }, { "epoch": 0.155125, "grad_norm": 2.671875, "grad_norm_var": 0.029524739583333334, "learning_rate": 0.0001, "loss": 7.9437, "loss/crossentropy": 2.553811550140381, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2530565932393074, "step": 2482 }, { "epoch": 0.15525, "grad_norm": 2.53125, "grad_norm_var": 0.04010009765625, "learning_rate": 0.0001, "loss": 7.8713, "loss/crossentropy": 2.441239356994629, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2551605701446533, "step": 2484 }, { "epoch": 0.155375, "grad_norm": 2.5625, "grad_norm_var": 0.038309733072916664, "learning_rate": 0.0001, "loss": 7.7061, "loss/crossentropy": 2.370589256286621, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25826428830623627, "step": 2486 }, { "epoch": 0.1555, "grad_norm": 2.640625, "grad_norm_var": 0.026390584309895833, "learning_rate": 0.0001, "loss": 7.7961, "loss/crossentropy": 2.1814417839050293, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2495630532503128, "step": 2488 }, { "epoch": 0.155625, "grad_norm": 2.890625, "grad_norm_var": 0.029173787434895834, "learning_rate": 0.0001, "loss": 7.8197, "loss/crossentropy": 2.4883482456207275, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26348088681697845, "step": 2490 }, { "epoch": 0.15575, "grad_norm": 2.578125, "grad_norm_var": 0.027632649739583334, "learning_rate": 0.0001, "loss": 7.8219, "loss/crossentropy": 2.1873401403427124, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24557159841060638, "step": 2492 }, { "epoch": 0.155875, "grad_norm": 2.75, "grad_norm_var": 0.02867431640625, "learning_rate": 0.0001, "loss": 7.8562, "loss/crossentropy": 2.3110402822494507, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24427203088998795, "step": 2494 }, { "epoch": 0.156, "grad_norm": 2.953125, "grad_norm_var": 0.03600260416666667, "learning_rate": 0.0001, "loss": 7.8415, "loss/crossentropy": 2.2570624351501465, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24605220556259155, "step": 2496 }, { "epoch": 0.156125, "grad_norm": 2.3125, "grad_norm_var": 0.04087626139322917, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.2185534238815308, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2364451214671135, "step": 2498 }, { "epoch": 0.15625, "grad_norm": 3.078125, "grad_norm_var": 0.039948527018229166, "learning_rate": 0.0001, "loss": 7.8161, "loss/crossentropy": 2.2274473905563354, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24973751604557037, "step": 2500 }, { "epoch": 0.156375, "grad_norm": 2.46875, "grad_norm_var": 0.042292277018229164, "learning_rate": 0.0001, "loss": 8.0038, "loss/crossentropy": 2.3013203144073486, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24737702310085297, "step": 2502 }, { "epoch": 0.1565, "grad_norm": 2.484375, "grad_norm_var": 0.0455078125, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 2.5051584243774414, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25412553548812866, "step": 2504 }, { "epoch": 0.156625, "grad_norm": 2.890625, "grad_norm_var": 0.0430084228515625, "learning_rate": 0.0001, "loss": 7.9481, "loss/crossentropy": 2.189783751964569, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24579951167106628, "step": 2506 }, { "epoch": 0.15675, "grad_norm": 2.796875, "grad_norm_var": 0.0482818603515625, "learning_rate": 0.0001, "loss": 7.8348, "loss/crossentropy": 2.629545569419861, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26841507852077484, "step": 2508 }, { "epoch": 0.156875, "grad_norm": 2.34375, "grad_norm_var": 0.060212198893229166, "learning_rate": 0.0001, "loss": 7.7123, "loss/crossentropy": 2.267096519470215, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24167770147323608, "step": 2510 }, { "epoch": 0.157, "grad_norm": 2.546875, "grad_norm_var": 0.05322977701822917, "learning_rate": 0.0001, "loss": 7.6795, "loss/crossentropy": 2.170002818107605, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2375175580382347, "step": 2512 }, { "epoch": 0.157125, "grad_norm": 2.890625, "grad_norm_var": 0.05366109212239583, "learning_rate": 0.0001, "loss": 7.8975, "loss/crossentropy": 2.1839526891708374, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24612244218587875, "step": 2514 }, { "epoch": 0.15725, "grad_norm": 2.484375, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 7.7503, "loss/crossentropy": 1.7978224754333496, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22572653740644455, "step": 2516 }, { "epoch": 0.157375, "grad_norm": 2.9375, "grad_norm_var": 0.052567545572916666, "learning_rate": 0.0001, "loss": 7.9403, "loss/crossentropy": 2.2894846200942993, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24215564131736755, "step": 2518 }, { "epoch": 0.1575, "grad_norm": 2.578125, "grad_norm_var": 0.052469889322916664, "learning_rate": 0.0001, "loss": 8.0173, "loss/crossentropy": 2.271655559539795, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23966332525014877, "step": 2520 }, { "epoch": 0.157625, "grad_norm": 2.640625, "grad_norm_var": 0.051070149739583334, "learning_rate": 0.0001, "loss": 8.027, "loss/crossentropy": 2.30005145072937, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2628418430685997, "step": 2522 }, { "epoch": 0.15775, "grad_norm": 2.65625, "grad_norm_var": 0.03728841145833333, "learning_rate": 0.0001, "loss": 7.5896, "loss/crossentropy": 2.0799155235290527, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23638420552015305, "step": 2524 }, { "epoch": 0.157875, "grad_norm": 2.859375, "grad_norm_var": 0.03769124348958333, "learning_rate": 0.0001, "loss": 7.9816, "loss/crossentropy": 2.13996684551239, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24061943590641022, "step": 2526 }, { "epoch": 0.158, "grad_norm": 2.546875, "grad_norm_var": 0.03584696451822917, "learning_rate": 0.0001, "loss": 7.89, "loss/crossentropy": 2.4007346630096436, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2567130923271179, "step": 2528 }, { "epoch": 0.158125, "grad_norm": 2.859375, "grad_norm_var": 0.0336578369140625, "learning_rate": 0.0001, "loss": 7.8945, "loss/crossentropy": 2.3631176948547363, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2415386661887169, "step": 2530 }, { "epoch": 0.15825, "grad_norm": 2.5625, "grad_norm_var": 0.02857666015625, "learning_rate": 0.0001, "loss": 7.7907, "loss/crossentropy": 2.058986485004425, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24685738235712051, "step": 2532 }, { "epoch": 0.158375, "grad_norm": 2.5, "grad_norm_var": 0.02593994140625, "learning_rate": 0.0001, "loss": 7.8062, "loss/crossentropy": 2.2281254529953003, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24689025431871414, "step": 2534 }, { "epoch": 0.1585, "grad_norm": 2.5625, "grad_norm_var": 0.023127237955729168, "learning_rate": 0.0001, "loss": 7.8028, "loss/crossentropy": 2.4382470846176147, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27194739878177643, "step": 2536 }, { "epoch": 0.158625, "grad_norm": 2.96875, "grad_norm_var": 0.028083292643229167, "learning_rate": 0.0001, "loss": 7.8523, "loss/crossentropy": 2.2935184240341187, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23303698748350143, "step": 2538 }, { "epoch": 0.15875, "grad_norm": 2.578125, "grad_norm_var": 0.0289703369140625, "learning_rate": 0.0001, "loss": 7.8038, "loss/crossentropy": 2.43733286857605, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23855997622013092, "step": 2540 }, { "epoch": 0.158875, "grad_norm": 2.796875, "grad_norm_var": 0.020894368489583332, "learning_rate": 0.0001, "loss": 7.8183, "loss/crossentropy": 2.318352222442627, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2344244346022606, "step": 2542 }, { "epoch": 0.159, "grad_norm": 2.546875, "grad_norm_var": 0.021468098958333334, "learning_rate": 0.0001, "loss": 7.7506, "loss/crossentropy": 2.172747015953064, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23044558614492416, "step": 2544 }, { "epoch": 0.159125, "grad_norm": 2.625, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 7.7647, "loss/crossentropy": 2.2639771699905396, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2624407559633255, "step": 2546 }, { "epoch": 0.15925, "grad_norm": 2.59375, "grad_norm_var": 0.05847981770833333, "learning_rate": 0.0001, "loss": 7.8837, "loss/crossentropy": 2.484058380126953, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2976628988981247, "step": 2548 }, { "epoch": 0.159375, "grad_norm": 2.671875, "grad_norm_var": 0.05621337890625, "learning_rate": 0.0001, "loss": 7.9374, "loss/crossentropy": 2.439212441444397, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23525572568178177, "step": 2550 }, { "epoch": 0.1595, "grad_norm": 2.6875, "grad_norm_var": 0.061009724934895836, "learning_rate": 0.0001, "loss": 7.6554, "loss/crossentropy": 2.046541452407837, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23553012311458588, "step": 2552 }, { "epoch": 0.159625, "grad_norm": 2.65625, "grad_norm_var": 0.058430989583333336, "learning_rate": 0.0001, "loss": 7.679, "loss/crossentropy": 2.027937591075897, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24199260026216507, "step": 2554 }, { "epoch": 0.15975, "grad_norm": 2.4375, "grad_norm_var": 0.06506245930989583, "learning_rate": 0.0001, "loss": 7.6605, "loss/crossentropy": 1.922214150428772, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21722379326820374, "step": 2556 }, { "epoch": 0.159875, "grad_norm": 2.796875, "grad_norm_var": 0.06368815104166667, "learning_rate": 0.0001, "loss": 7.947, "loss/crossentropy": 2.41329824924469, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23701953887939453, "step": 2558 }, { "epoch": 0.16, "grad_norm": 2.625, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 7.9478, "loss/crossentropy": 2.364239811897278, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.26914364099502563, "step": 2560 }, { "epoch": 0.160125, "grad_norm": 2.53125, "grad_norm_var": 0.06280924479166666, "learning_rate": 0.0001, "loss": 7.8012, "loss/crossentropy": 2.2895541191101074, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2240670844912529, "step": 2562 }, { "epoch": 0.16025, "grad_norm": 2.59375, "grad_norm_var": 0.022858683268229166, "learning_rate": 0.0001, "loss": 7.9514, "loss/crossentropy": 2.21063768863678, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23704807460308075, "step": 2564 }, { "epoch": 0.160375, "grad_norm": 2.421875, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 7.9352, "loss/crossentropy": 2.1890910863876343, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25638218969106674, "step": 2566 }, { "epoch": 0.1605, "grad_norm": 2.734375, "grad_norm_var": 0.023014322916666666, "learning_rate": 0.0001, "loss": 7.7254, "loss/crossentropy": 2.3455424308776855, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22738848626613617, "step": 2568 }, { "epoch": 0.160625, "grad_norm": 2.640625, "grad_norm_var": 0.018830362955729166, "learning_rate": 0.0001, "loss": 7.9244, "loss/crossentropy": 2.4760804176330566, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2477440983057022, "step": 2570 }, { "epoch": 0.16075, "grad_norm": 2.78125, "grad_norm_var": 0.0157379150390625, "learning_rate": 0.0001, "loss": 7.8971, "loss/crossentropy": 2.2207542657852173, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23943090438842773, "step": 2572 }, { "epoch": 0.160875, "grad_norm": 2.796875, "grad_norm_var": 0.0168609619140625, "learning_rate": 0.0001, "loss": 7.8168, "loss/crossentropy": 2.5181933641433716, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2479179948568344, "step": 2574 }, { "epoch": 0.161, "grad_norm": 2.75, "grad_norm_var": 0.015721638997395832, "learning_rate": 0.0001, "loss": 7.7453, "loss/crossentropy": 2.37592613697052, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24624846875667572, "step": 2576 }, { "epoch": 0.161125, "grad_norm": 3.15625, "grad_norm_var": 0.030204264322916667, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.2562596797943115, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2466205209493637, "step": 2578 }, { "epoch": 0.16125, "grad_norm": 2.625, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 7.8813, "loss/crossentropy": 2.23412823677063, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2418230101466179, "step": 2580 }, { "epoch": 0.161375, "grad_norm": 2.625, "grad_norm_var": 0.023030598958333332, "learning_rate": 0.0001, "loss": 7.9116, "loss/crossentropy": 2.529879093170166, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2786422669887543, "step": 2582 }, { "epoch": 0.1615, "grad_norm": 2.515625, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.7615, "loss/crossentropy": 2.1202937364578247, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24669316411018372, "step": 2584 }, { "epoch": 0.161625, "grad_norm": 2.59375, "grad_norm_var": 0.03206278483072917, "learning_rate": 0.0001, "loss": 7.7169, "loss/crossentropy": 2.2816332578659058, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23847512155771255, "step": 2586 }, { "epoch": 0.16175, "grad_norm": 2.828125, "grad_norm_var": 0.032136027018229166, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 2.264583945274353, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23760029673576355, "step": 2588 }, { "epoch": 0.161875, "grad_norm": 2.4375, "grad_norm_var": 0.03433837890625, "learning_rate": 0.0001, "loss": 7.8559, "loss/crossentropy": 2.070025682449341, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2591887414455414, "step": 2590 }, { "epoch": 0.162, "grad_norm": 2.453125, "grad_norm_var": 0.039388020833333336, "learning_rate": 0.0001, "loss": 7.8661, "loss/crossentropy": 2.3117960691452026, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23791569471359253, "step": 2592 }, { "epoch": 0.162125, "grad_norm": 2.5625, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.4415, "loss/crossentropy": 2.2540050745010376, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22730688750743866, "step": 2594 }, { "epoch": 0.16225, "grad_norm": 2.703125, "grad_norm_var": 0.03404947916666667, "learning_rate": 0.0001, "loss": 7.6522, "loss/crossentropy": 1.7815396785736084, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2086879387497902, "step": 2596 }, { "epoch": 0.162375, "grad_norm": 2.625, "grad_norm_var": 0.03359375, "learning_rate": 0.0001, "loss": 7.7545, "loss/crossentropy": 2.38827908039093, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24543824791908264, "step": 2598 }, { "epoch": 0.1625, "grad_norm": 2.84375, "grad_norm_var": 0.03720703125, "learning_rate": 0.0001, "loss": 7.8079, "loss/crossentropy": 2.4880915880203247, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2978169023990631, "step": 2600 }, { "epoch": 0.162625, "grad_norm": 2.515625, "grad_norm_var": 0.03308817545572917, "learning_rate": 0.0001, "loss": 7.8877, "loss/crossentropy": 2.4218918085098267, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24140693247318268, "step": 2602 }, { "epoch": 0.16275, "grad_norm": 2.53125, "grad_norm_var": 0.031053670247395835, "learning_rate": 0.0001, "loss": 7.8144, "loss/crossentropy": 2.266621947288513, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26293135434389114, "step": 2604 }, { "epoch": 0.162875, "grad_norm": 2.578125, "grad_norm_var": 0.028499348958333334, "learning_rate": 0.0001, "loss": 7.7038, "loss/crossentropy": 2.1036359071731567, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2307727113366127, "step": 2606 }, { "epoch": 0.163, "grad_norm": 2.6875, "grad_norm_var": 0.04426981608072917, "learning_rate": 0.0001, "loss": 7.8715, "loss/crossentropy": 2.2452776432037354, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26242855191230774, "step": 2608 }, { "epoch": 0.163125, "grad_norm": 2.46875, "grad_norm_var": 0.04185791015625, "learning_rate": 0.0001, "loss": 7.8903, "loss/crossentropy": 2.2557637691497803, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.272522896528244, "step": 2610 }, { "epoch": 0.16325, "grad_norm": 2.65625, "grad_norm_var": 0.03326416015625, "learning_rate": 0.0001, "loss": 7.9233, "loss/crossentropy": 2.375272512435913, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2328314259648323, "step": 2612 }, { "epoch": 0.163375, "grad_norm": 2.984375, "grad_norm_var": 0.03937886555989583, "learning_rate": 0.0001, "loss": 7.8084, "loss/crossentropy": 2.2634716033935547, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2391228973865509, "step": 2614 }, { "epoch": 0.1635, "grad_norm": 2.75, "grad_norm_var": 0.04064127604166667, "learning_rate": 0.0001, "loss": 7.888, "loss/crossentropy": 2.5287814140319824, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2443351447582245, "step": 2616 }, { "epoch": 0.163625, "grad_norm": 2.375, "grad_norm_var": 0.04646708170572917, "learning_rate": 0.0001, "loss": 7.6805, "loss/crossentropy": 2.142694592475891, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24597708880901337, "step": 2618 }, { "epoch": 0.16375, "grad_norm": 3.25, "grad_norm_var": 0.06467997233072917, "learning_rate": 0.0001, "loss": 7.8287, "loss/crossentropy": 1.996739685535431, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25455768406391144, "step": 2620 }, { "epoch": 0.163875, "grad_norm": 2.40625, "grad_norm_var": 0.06575419108072916, "learning_rate": 0.0001, "loss": 7.8684, "loss/crossentropy": 2.460938572883606, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25081367045640945, "step": 2622 }, { "epoch": 0.164, "grad_norm": 2.828125, "grad_norm_var": 0.05423177083333333, "learning_rate": 0.0001, "loss": 7.8841, "loss/crossentropy": 2.4802383184432983, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24186843633651733, "step": 2624 }, { "epoch": 0.164125, "grad_norm": 2.609375, "grad_norm_var": 0.05076395670572917, "learning_rate": 0.0001, "loss": 7.8454, "loss/crossentropy": 2.209325075149536, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.237510085105896, "step": 2626 }, { "epoch": 0.16425, "grad_norm": 2.28125, "grad_norm_var": 0.06262613932291666, "learning_rate": 0.0001, "loss": 7.7406, "loss/crossentropy": 2.0401015281677246, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.20650795102119446, "step": 2628 }, { "epoch": 0.164375, "grad_norm": 2.890625, "grad_norm_var": 0.06002604166666667, "learning_rate": 0.0001, "loss": 7.894, "loss/crossentropy": 2.5026817321777344, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24748124927282333, "step": 2630 }, { "epoch": 0.1645, "grad_norm": 2.34375, "grad_norm_var": 0.0602691650390625, "learning_rate": 0.0001, "loss": 7.7024, "loss/crossentropy": 2.2391568422317505, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22500373423099518, "step": 2632 }, { "epoch": 0.164625, "grad_norm": 2.578125, "grad_norm_var": 0.05440165201822917, "learning_rate": 0.0001, "loss": 7.7876, "loss/crossentropy": 2.2685747742652893, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2358316034078598, "step": 2634 }, { "epoch": 0.16475, "grad_norm": 2.703125, "grad_norm_var": 0.027220662434895834, "learning_rate": 0.0001, "loss": 7.7126, "loss/crossentropy": 2.2785946130752563, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24026557803153992, "step": 2636 }, { "epoch": 0.164875, "grad_norm": 2.671875, "grad_norm_var": 0.024898274739583334, "learning_rate": 0.0001, "loss": 7.8153, "loss/crossentropy": 2.2721141576766968, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2398277372121811, "step": 2638 }, { "epoch": 0.165, "grad_norm": 2.5625, "grad_norm_var": 0.023224894205729166, "learning_rate": 0.0001, "loss": 7.9432, "loss/crossentropy": 2.229793667793274, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21887121349573135, "step": 2640 }, { "epoch": 0.165125, "grad_norm": 2.4375, "grad_norm_var": 0.023876953125, "learning_rate": 0.0001, "loss": 7.603, "loss/crossentropy": 2.1890532970428467, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23123417794704437, "step": 2642 }, { "epoch": 0.16525, "grad_norm": 2.53125, "grad_norm_var": 0.0223052978515625, "learning_rate": 0.0001, "loss": 7.6917, "loss/crossentropy": 2.172744870185852, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22291851788759232, "step": 2644 }, { "epoch": 0.165375, "grad_norm": 2.75, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 7.6527, "loss/crossentropy": 2.2864272594451904, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24596457928419113, "step": 2646 }, { "epoch": 0.1655, "grad_norm": 2.53125, "grad_norm_var": 0.01695556640625, "learning_rate": 0.0001, "loss": 7.858, "loss/crossentropy": 2.428277611732483, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25081363320350647, "step": 2648 }, { "epoch": 0.165625, "grad_norm": 2.671875, "grad_norm_var": 0.017508951822916667, "learning_rate": 0.0001, "loss": 7.8198, "loss/crossentropy": 2.2622756958007812, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24526074528694153, "step": 2650 }, { "epoch": 0.16575, "grad_norm": 2.453125, "grad_norm_var": 0.0265289306640625, "learning_rate": 0.0001, "loss": 7.6316, "loss/crossentropy": 2.1581307649612427, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24160084128379822, "step": 2652 }, { "epoch": 0.165875, "grad_norm": 2.78125, "grad_norm_var": 0.0283203125, "learning_rate": 0.0001, "loss": 7.8289, "loss/crossentropy": 2.2180778980255127, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24087932705879211, "step": 2654 }, { "epoch": 0.166, "grad_norm": 2.796875, "grad_norm_var": 0.030939737955729168, "learning_rate": 0.0001, "loss": 7.8398, "loss/crossentropy": 2.3046650886535645, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25179338455200195, "step": 2656 }, { "epoch": 0.166125, "grad_norm": 2.796875, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 7.7888, "loss/crossentropy": 2.251810073852539, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23383210599422455, "step": 2658 }, { "epoch": 0.16625, "grad_norm": 2.515625, "grad_norm_var": 0.0210357666015625, "learning_rate": 0.0001, "loss": 7.6108, "loss/crossentropy": 2.326148748397827, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.267700731754303, "step": 2660 }, { "epoch": 0.166375, "grad_norm": 2.640625, "grad_norm_var": 0.021410115559895835, "learning_rate": 0.0001, "loss": 7.6615, "loss/crossentropy": 2.3175487518310547, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2537999600172043, "step": 2662 }, { "epoch": 0.1665, "grad_norm": 2.609375, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.7319, "loss/crossentropy": 2.4394067525863647, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24439330399036407, "step": 2664 }, { "epoch": 0.166625, "grad_norm": 2.5, "grad_norm_var": 0.024934895833333335, "learning_rate": 0.0001, "loss": 7.7306, "loss/crossentropy": 2.377542495727539, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2411457523703575, "step": 2666 }, { "epoch": 0.16675, "grad_norm": 2.96875, "grad_norm_var": 0.026097615559895832, "learning_rate": 0.0001, "loss": 7.7291, "loss/crossentropy": 2.244265556335449, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2636634260416031, "step": 2668 }, { "epoch": 0.166875, "grad_norm": 2.40625, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 7.817, "loss/crossentropy": 2.3067715167999268, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2648337334394455, "step": 2670 }, { "epoch": 0.167, "grad_norm": 2.78125, "grad_norm_var": 0.02916259765625, "learning_rate": 0.0001, "loss": 7.9386, "loss/crossentropy": 2.3284155130386353, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25726044178009033, "step": 2672 }, { "epoch": 0.167125, "grad_norm": 2.609375, "grad_norm_var": 0.03279520670572917, "learning_rate": 0.0001, "loss": 7.6504, "loss/crossentropy": 2.1608939170837402, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23987659811973572, "step": 2674 }, { "epoch": 0.16725, "grad_norm": 3.0625, "grad_norm_var": 0.04436442057291667, "learning_rate": 0.0001, "loss": 7.7824, "loss/crossentropy": 2.156682312488556, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22545601427555084, "step": 2676 }, { "epoch": 0.167375, "grad_norm": 2.765625, "grad_norm_var": 0.07141520182291666, "learning_rate": 0.0001, "loss": 7.8573, "loss/crossentropy": 2.1365780234336853, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.21754685044288635, "step": 2678 }, { "epoch": 0.1675, "grad_norm": 2.484375, "grad_norm_var": 0.07617899576822916, "learning_rate": 0.0001, "loss": 7.9647, "loss/crossentropy": 2.27071213722229, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22438976168632507, "step": 2680 }, { "epoch": 0.167625, "grad_norm": 2.765625, "grad_norm_var": 0.0685455322265625, "learning_rate": 0.0001, "loss": 7.7773, "loss/crossentropy": 2.115522563457489, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22477930784225464, "step": 2682 }, { "epoch": 0.16775, "grad_norm": 2.46875, "grad_norm_var": 0.06461181640625, "learning_rate": 0.0001, "loss": 7.7755, "loss/crossentropy": 2.215229034423828, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22990095615386963, "step": 2684 }, { "epoch": 0.167875, "grad_norm": 2.484375, "grad_norm_var": 0.06243082682291667, "learning_rate": 0.0001, "loss": 7.7615, "loss/crossentropy": 2.180325746536255, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.232216514647007, "step": 2686 }, { "epoch": 0.168, "grad_norm": 2.453125, "grad_norm_var": 0.0634185791015625, "learning_rate": 0.0001, "loss": 7.713, "loss/crossentropy": 2.4484771490097046, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2512357458472252, "step": 2688 }, { "epoch": 0.168125, "grad_norm": 2.5625, "grad_norm_var": 0.0561431884765625, "learning_rate": 0.0001, "loss": 7.7087, "loss/crossentropy": 2.178835153579712, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24023275077342987, "step": 2690 }, { "epoch": 0.16825, "grad_norm": 2.53125, "grad_norm_var": 0.045750935872395836, "learning_rate": 0.0001, "loss": 7.6464, "loss/crossentropy": 2.0620937943458557, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23334022611379623, "step": 2692 }, { "epoch": 0.168375, "grad_norm": 2.53125, "grad_norm_var": 0.009740193684895834, "learning_rate": 0.0001, "loss": 7.8639, "loss/crossentropy": 2.4087116718292236, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2341567426919937, "step": 2694 }, { "epoch": 0.1685, "grad_norm": 2.890625, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 7.9106, "loss/crossentropy": 2.309167981147766, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24564050883054733, "step": 2696 }, { "epoch": 0.168625, "grad_norm": 2.28125, "grad_norm_var": 0.016364542643229167, "learning_rate": 0.0001, "loss": 7.6039, "loss/crossentropy": 1.9676810503005981, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2102392315864563, "step": 2698 }, { "epoch": 0.16875, "grad_norm": 2.65625, "grad_norm_var": 0.017118326822916665, "learning_rate": 0.0001, "loss": 7.7754, "loss/crossentropy": 2.380856513977051, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2278568521142006, "step": 2700 }, { "epoch": 0.168875, "grad_norm": 2.828125, "grad_norm_var": 0.020335896809895834, "learning_rate": 0.0001, "loss": 7.8029, "loss/crossentropy": 2.2248435020446777, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23989464342594147, "step": 2702 }, { "epoch": 0.169, "grad_norm": 2.65625, "grad_norm_var": 0.0283355712890625, "learning_rate": 0.0001, "loss": 7.7909, "loss/crossentropy": 2.2802765369415283, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23772113770246506, "step": 2704 }, { "epoch": 0.169125, "grad_norm": 2.640625, "grad_norm_var": 0.02998046875, "learning_rate": 0.0001, "loss": 7.8392, "loss/crossentropy": 2.320490837097168, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.232273131608963, "step": 2706 }, { "epoch": 0.16925, "grad_norm": 2.34375, "grad_norm_var": 0.036279296875, "learning_rate": 0.0001, "loss": 7.7277, "loss/crossentropy": 2.103710889816284, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.21974950283765793, "step": 2708 }, { "epoch": 0.169375, "grad_norm": 2.90625, "grad_norm_var": 0.0538726806640625, "learning_rate": 0.0001, "loss": 7.9849, "loss/crossentropy": 2.311343789100647, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27306586503982544, "step": 2710 }, { "epoch": 0.1695, "grad_norm": 4.21875, "grad_norm_var": 0.21113993326822916, "learning_rate": 0.0001, "loss": 7.898, "loss/crossentropy": 2.248456120491028, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23485350608825684, "step": 2712 }, { "epoch": 0.169625, "grad_norm": 3.640625, "grad_norm_var": 0.2821116129557292, "learning_rate": 0.0001, "loss": 8.0907, "loss/crossentropy": 2.3589508533477783, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23614779859781265, "step": 2714 }, { "epoch": 0.16975, "grad_norm": 2.9375, "grad_norm_var": 0.29221903483072914, "learning_rate": 0.0001, "loss": 7.8759, "loss/crossentropy": 2.472365975379944, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23715022206306458, "step": 2716 }, { "epoch": 0.169875, "grad_norm": 2.328125, "grad_norm_var": 0.3374420166015625, "learning_rate": 0.0001, "loss": 7.7942, "loss/crossentropy": 2.264963388442993, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24661238491535187, "step": 2718 }, { "epoch": 0.17, "grad_norm": 2.828125, "grad_norm_var": 0.3333943684895833, "learning_rate": 0.0001, "loss": 7.9196, "loss/crossentropy": 2.477718949317932, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25135859847068787, "step": 2720 }, { "epoch": 0.170125, "grad_norm": 2.40625, "grad_norm_var": 0.3395792643229167, "learning_rate": 0.0001, "loss": 7.8171, "loss/crossentropy": 2.2033207416534424, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24111789464950562, "step": 2722 }, { "epoch": 0.17025, "grad_norm": 2.671875, "grad_norm_var": 0.3217844645182292, "learning_rate": 0.0001, "loss": 7.7115, "loss/crossentropy": 2.3668792247772217, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2309272140264511, "step": 2724 }, { "epoch": 0.170375, "grad_norm": 2.609375, "grad_norm_var": 0.32696940104166666, "learning_rate": 0.0001, "loss": 7.807, "loss/crossentropy": 2.359344244003296, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25724074244499207, "step": 2726 }, { "epoch": 0.1705, "grad_norm": 2.4375, "grad_norm_var": 0.21409505208333332, "learning_rate": 0.0001, "loss": 7.7598, "loss/crossentropy": 2.2009201049804688, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23280736804008484, "step": 2728 }, { "epoch": 0.170625, "grad_norm": 2.59375, "grad_norm_var": 0.1070465087890625, "learning_rate": 0.0001, "loss": 7.8024, "loss/crossentropy": 2.1781824827194214, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23048165440559387, "step": 2730 }, { "epoch": 0.17075, "grad_norm": 2.734375, "grad_norm_var": 0.041890462239583336, "learning_rate": 0.0001, "loss": 7.867, "loss/crossentropy": 2.2078417539596558, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24096353352069855, "step": 2732 }, { "epoch": 0.170875, "grad_norm": 2.640625, "grad_norm_var": 0.03166910807291667, "learning_rate": 0.0001, "loss": 7.8545, "loss/crossentropy": 2.3574694395065308, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2407936155796051, "step": 2734 }, { "epoch": 0.171, "grad_norm": 2.5, "grad_norm_var": 0.026285807291666668, "learning_rate": 0.0001, "loss": 7.7336, "loss/crossentropy": 2.3674607276916504, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2338072955608368, "step": 2736 }, { "epoch": 0.171125, "grad_norm": 2.453125, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 7.7552, "loss/crossentropy": 2.1523255109786987, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23672043532133102, "step": 2738 }, { "epoch": 0.17125, "grad_norm": 2.375, "grad_norm_var": 0.026471964518229165, "learning_rate": 0.0001, "loss": 7.6336, "loss/crossentropy": 2.0955100655555725, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2361309677362442, "step": 2740 }, { "epoch": 0.171375, "grad_norm": 3.140625, "grad_norm_var": 0.036432902018229164, "learning_rate": 0.0001, "loss": 7.8829, "loss/crossentropy": 2.509047269821167, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2578771486878395, "step": 2742 }, { "epoch": 0.1715, "grad_norm": 2.484375, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 7.7991, "loss/crossentropy": 2.2338361740112305, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26055432856082916, "step": 2744 }, { "epoch": 0.171625, "grad_norm": 2.59375, "grad_norm_var": 0.033967081705729166, "learning_rate": 0.0001, "loss": 7.7437, "loss/crossentropy": 2.1647003889083862, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23635494709014893, "step": 2746 }, { "epoch": 0.17175, "grad_norm": 2.8125, "grad_norm_var": 0.03599344889322917, "learning_rate": 0.0001, "loss": 7.7051, "loss/crossentropy": 2.561861991882324, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2592911720275879, "step": 2748 }, { "epoch": 0.171875, "grad_norm": 2.390625, "grad_norm_var": 0.03669331868489583, "learning_rate": 0.0001, "loss": 7.6254, "loss/crossentropy": 2.082680583000183, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.21699509769678116, "step": 2750 }, { "epoch": 0.172, "grad_norm": 6.8125, "grad_norm_var": 1.1539052327473958, "learning_rate": 0.0001, "loss": 7.8237, "loss/crossentropy": 2.5276395082473755, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2522757425904274, "step": 2752 }, { "epoch": 0.172125, "grad_norm": 5.1875, "grad_norm_var": 6.189676920572917, "learning_rate": 0.0001, "loss": 8.0084, "loss/crossentropy": 2.2055013179779053, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23886322230100632, "step": 2754 }, { "epoch": 0.17225, "grad_norm": 2.375, "grad_norm_var": 6.181050618489583, "learning_rate": 0.0001, "loss": 7.9047, "loss/crossentropy": 2.394223690032959, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2515896260738373, "step": 2756 }, { "epoch": 0.172375, "grad_norm": 2.90625, "grad_norm_var": 6.165526326497396, "learning_rate": 0.0001, "loss": 7.9735, "loss/crossentropy": 2.2563817501068115, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23824837803840637, "step": 2758 }, { "epoch": 0.1725, "grad_norm": 2.484375, "grad_norm_var": 6.159468587239584, "learning_rate": 0.0001, "loss": 7.8461, "loss/crossentropy": 2.244271457195282, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22762248665094376, "step": 2760 }, { "epoch": 0.172625, "grad_norm": 2.609375, "grad_norm_var": 6.164518229166666, "learning_rate": 0.0001, "loss": 7.7406, "loss/crossentropy": 2.183770179748535, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21703775227069855, "step": 2762 }, { "epoch": 0.17275, "grad_norm": 3.5625, "grad_norm_var": 6.166910807291667, "learning_rate": 0.0001, "loss": 7.9424, "loss/crossentropy": 2.1734741926193237, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2613416016101837, "step": 2764 }, { "epoch": 0.172875, "grad_norm": 2.859375, "grad_norm_var": 6.048502604166667, "learning_rate": 0.0001, "loss": 7.9174, "loss/crossentropy": 2.32150936126709, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23922567069530487, "step": 2766 }, { "epoch": 0.173, "grad_norm": 2.65625, "grad_norm_var": 5.403804524739583, "learning_rate": 0.0001, "loss": 7.6948, "loss/crossentropy": 2.0935378074645996, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23874947428703308, "step": 2768 }, { "epoch": 0.173125, "grad_norm": 2.703125, "grad_norm_var": 0.39658915201822914, "learning_rate": 0.0001, "loss": 7.9079, "loss/crossentropy": 2.2263898253440857, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24427123367786407, "step": 2770 }, { "epoch": 0.17325, "grad_norm": 2.625, "grad_norm_var": 0.37922261555989584, "learning_rate": 0.0001, "loss": 7.8895, "loss/crossentropy": 2.1437301635742188, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22952381521463394, "step": 2772 }, { "epoch": 0.173375, "grad_norm": 2.4375, "grad_norm_var": 0.40615946451822915, "learning_rate": 0.0001, "loss": 7.8372, "loss/crossentropy": 2.293881416320801, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2393340766429901, "step": 2774 }, { "epoch": 0.1735, "grad_norm": 2.640625, "grad_norm_var": 0.4012980143229167, "learning_rate": 0.0001, "loss": 7.7442, "loss/crossentropy": 2.258249878883362, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23926617950201035, "step": 2776 }, { "epoch": 0.173625, "grad_norm": 3.953125, "grad_norm_var": 0.4626210530598958, "learning_rate": 0.0001, "loss": 7.7726, "loss/crossentropy": 2.0506081581115723, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2070065289735794, "step": 2778 }, { "epoch": 0.17375, "grad_norm": 2.5625, "grad_norm_var": 0.14410400390625, "learning_rate": 0.0001, "loss": 7.7289, "loss/crossentropy": 2.130703330039978, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23245615512132645, "step": 2780 }, { "epoch": 0.173875, "grad_norm": 2.578125, "grad_norm_var": 0.1447906494140625, "learning_rate": 0.0001, "loss": 7.8126, "loss/crossentropy": 2.336124062538147, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2596941590309143, "step": 2782 }, { "epoch": 0.174, "grad_norm": 2.484375, "grad_norm_var": 0.13846028645833333, "learning_rate": 0.0001, "loss": 7.7806, "loss/crossentropy": 2.313853621482849, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24538902938365936, "step": 2784 }, { "epoch": 0.174125, "grad_norm": 2.796875, "grad_norm_var": 0.13967997233072918, "learning_rate": 0.0001, "loss": 7.7471, "loss/crossentropy": 2.2884416580200195, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24766312539577484, "step": 2786 }, { "epoch": 0.17425, "grad_norm": 2.609375, "grad_norm_var": 0.14446512858072916, "learning_rate": 0.0001, "loss": 7.8361, "loss/crossentropy": 2.3857977390289307, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2376108318567276, "step": 2788 }, { "epoch": 0.174375, "grad_norm": 2.46875, "grad_norm_var": 0.14230855305989584, "learning_rate": 0.0001, "loss": 7.7924, "loss/crossentropy": 2.2684574127197266, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25091154873371124, "step": 2790 }, { "epoch": 0.1745, "grad_norm": 2.671875, "grad_norm_var": 0.1416168212890625, "learning_rate": 0.0001, "loss": 7.7769, "loss/crossentropy": 2.2278876304626465, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2252519577741623, "step": 2792 }, { "epoch": 0.174625, "grad_norm": 2.4375, "grad_norm_var": 0.03281962076822917, "learning_rate": 0.0001, "loss": 7.5981, "loss/crossentropy": 2.2489346265792847, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23500560969114304, "step": 2794 }, { "epoch": 0.17475, "grad_norm": 2.5, "grad_norm_var": 0.022248331705729166, "learning_rate": 0.0001, "loss": 7.7545, "loss/crossentropy": 2.295899510383606, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2576214596629143, "step": 2796 }, { "epoch": 0.174875, "grad_norm": 2.46875, "grad_norm_var": 0.0241851806640625, "learning_rate": 0.0001, "loss": 7.8388, "loss/crossentropy": 2.3035519123077393, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2654409855604172, "step": 2798 }, { "epoch": 0.175, "grad_norm": 3.015625, "grad_norm_var": 0.037060546875, "learning_rate": 0.0001, "loss": 7.9235, "loss/crossentropy": 2.2352579832077026, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2841232195496559, "step": 2800 }, { "epoch": 0.175125, "grad_norm": 2.890625, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 7.709, "loss/crossentropy": 2.3198055028915405, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23578546196222305, "step": 2802 }, { "epoch": 0.17525, "grad_norm": 3.15625, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 7.8402, "loss/crossentropy": 2.2245510816574097, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23328936845064163, "step": 2804 }, { "epoch": 0.175375, "grad_norm": 2.703125, "grad_norm_var": 0.13108317057291666, "learning_rate": 0.0001, "loss": 7.7222, "loss/crossentropy": 2.3215973377227783, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2323600873351097, "step": 2806 }, { "epoch": 0.1755, "grad_norm": 2.53125, "grad_norm_var": 0.13945210774739583, "learning_rate": 0.0001, "loss": 7.9641, "loss/crossentropy": 2.6317641735076904, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25842827558517456, "step": 2808 }, { "epoch": 0.175625, "grad_norm": 2.46875, "grad_norm_var": 0.13935139973958333, "learning_rate": 0.0001, "loss": 7.82, "loss/crossentropy": 2.215041160583496, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25014058500528336, "step": 2810 }, { "epoch": 0.17575, "grad_norm": 2.4375, "grad_norm_var": 0.13599853515625, "learning_rate": 0.0001, "loss": 7.7395, "loss/crossentropy": 2.1867226362228394, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23424651473760605, "step": 2812 }, { "epoch": 0.175875, "grad_norm": 2.28125, "grad_norm_var": 0.1615631103515625, "learning_rate": 0.0001, "loss": 7.5309, "loss/crossentropy": 2.217566967010498, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.226304829120636, "step": 2814 }, { "epoch": 0.176, "grad_norm": 2.875, "grad_norm_var": 0.15181884765625, "learning_rate": 0.0001, "loss": 7.759, "loss/crossentropy": 2.0618727803230286, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2439548224210739, "step": 2816 }, { "epoch": 0.176125, "grad_norm": 3.515625, "grad_norm_var": 0.6506795247395833, "learning_rate": 0.0001, "loss": 7.8753, "loss/crossentropy": 2.409805655479431, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2505335807800293, "step": 2818 }, { "epoch": 0.17625, "grad_norm": 3.03125, "grad_norm_var": 0.6482248942057292, "learning_rate": 0.0001, "loss": 7.5632, "loss/crossentropy": 2.03094744682312, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21356388926506042, "step": 2820 }, { "epoch": 0.176375, "grad_norm": 2.265625, "grad_norm_var": 0.61324462890625, "learning_rate": 0.0001, "loss": 7.6257, "loss/crossentropy": 2.3141270875930786, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25053539127111435, "step": 2822 }, { "epoch": 0.1765, "grad_norm": 2.71875, "grad_norm_var": 0.60732421875, "learning_rate": 0.0001, "loss": 7.8661, "loss/crossentropy": 2.174069106578827, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22884509712457657, "step": 2824 }, { "epoch": 0.176625, "grad_norm": 2.625, "grad_norm_var": 0.6008626302083333, "learning_rate": 0.0001, "loss": 7.5315, "loss/crossentropy": 2.08145010471344, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22816456109285355, "step": 2826 }, { "epoch": 0.17675, "grad_norm": 2.375, "grad_norm_var": 0.6063313802083333, "learning_rate": 0.0001, "loss": 7.6201, "loss/crossentropy": 2.094637870788574, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2243911251425743, "step": 2828 }, { "epoch": 0.176875, "grad_norm": 2.53125, "grad_norm_var": 0.5699940999348958, "learning_rate": 0.0001, "loss": 7.7378, "loss/crossentropy": 2.1945928931236267, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2399984896183014, "step": 2830 }, { "epoch": 0.177, "grad_norm": 2.5, "grad_norm_var": 0.5867421468098958, "learning_rate": 0.0001, "loss": 7.6547, "loss/crossentropy": 2.3012553453445435, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23795722424983978, "step": 2832 }, { "epoch": 0.177125, "grad_norm": 2.5, "grad_norm_var": 0.0286773681640625, "learning_rate": 0.0001, "loss": 7.6384, "loss/crossentropy": 2.424581289291382, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2470892071723938, "step": 2834 }, { "epoch": 0.17725, "grad_norm": 2.671875, "grad_norm_var": 0.0132720947265625, "learning_rate": 0.0001, "loss": 7.7292, "loss/crossentropy": 2.136981964111328, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23704643547534943, "step": 2836 }, { "epoch": 0.177375, "grad_norm": 2.421875, "grad_norm_var": 0.01051025390625, "learning_rate": 0.0001, "loss": 7.8595, "loss/crossentropy": 2.285550355911255, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23677106201648712, "step": 2838 }, { "epoch": 0.1775, "grad_norm": 2.859375, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 7.6297, "loss/crossentropy": 2.1494513750076294, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2412027269601822, "step": 2840 }, { "epoch": 0.177625, "grad_norm": 3.0, "grad_norm_var": 0.026984659830729167, "learning_rate": 0.0001, "loss": 7.7216, "loss/crossentropy": 2.605257034301758, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25743308663368225, "step": 2842 }, { "epoch": 0.17775, "grad_norm": 2.5, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 7.6461, "loss/crossentropy": 2.0102819204330444, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2197662517428398, "step": 2844 }, { "epoch": 0.177875, "grad_norm": 2.6875, "grad_norm_var": 0.028564453125, "learning_rate": 0.0001, "loss": 7.8794, "loss/crossentropy": 2.154181718826294, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23210670053958893, "step": 2846 }, { "epoch": 0.178, "grad_norm": 2.421875, "grad_norm_var": 0.0362945556640625, "learning_rate": 0.0001, "loss": 7.8658, "loss/crossentropy": 2.0595306158065796, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.26342423260211945, "step": 2848 }, { "epoch": 0.178125, "grad_norm": 2.578125, "grad_norm_var": 0.03465067545572917, "learning_rate": 0.0001, "loss": 7.5678, "loss/crossentropy": 2.179203987121582, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22831647098064423, "step": 2850 }, { "epoch": 0.17825, "grad_norm": 2.875, "grad_norm_var": 0.0369781494140625, "learning_rate": 0.0001, "loss": 7.6801, "loss/crossentropy": 2.0674314498901367, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23797215521335602, "step": 2852 }, { "epoch": 0.178375, "grad_norm": 2.765625, "grad_norm_var": 0.0592681884765625, "learning_rate": 0.0001, "loss": 7.9571, "loss/crossentropy": 2.3534988164901733, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26077476143836975, "step": 2854 }, { "epoch": 0.1785, "grad_norm": 2.578125, "grad_norm_var": 0.07317708333333334, "learning_rate": 0.0001, "loss": 7.6968, "loss/crossentropy": 2.10899019241333, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22398869693279266, "step": 2856 }, { "epoch": 0.178625, "grad_norm": 2.4375, "grad_norm_var": 0.07069905598958333, "learning_rate": 0.0001, "loss": 7.8299, "loss/crossentropy": 2.319381833076477, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24246351420879364, "step": 2858 }, { "epoch": 0.17875, "grad_norm": 2.484375, "grad_norm_var": 0.07053934733072917, "learning_rate": 0.0001, "loss": 7.6359, "loss/crossentropy": 2.083498954772949, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23692196607589722, "step": 2860 }, { "epoch": 0.178875, "grad_norm": 2.359375, "grad_norm_var": 0.0753814697265625, "learning_rate": 0.0001, "loss": 7.8099, "loss/crossentropy": 2.1947683095932007, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24239200353622437, "step": 2862 }, { "epoch": 0.179, "grad_norm": 2.671875, "grad_norm_var": 0.06674702962239583, "learning_rate": 0.0001, "loss": 7.7765, "loss/crossentropy": 2.209209442138672, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24729400128126144, "step": 2864 }, { "epoch": 0.179125, "grad_norm": 2.53125, "grad_norm_var": 0.0657135009765625, "learning_rate": 0.0001, "loss": 7.8832, "loss/crossentropy": 2.3722325563430786, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24714642018079758, "step": 2866 }, { "epoch": 0.17925, "grad_norm": 2.578125, "grad_norm_var": 0.07121988932291666, "learning_rate": 0.0001, "loss": 7.7811, "loss/crossentropy": 2.4134687185287476, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2480832040309906, "step": 2868 }, { "epoch": 0.179375, "grad_norm": 2.625, "grad_norm_var": 0.0347320556640625, "learning_rate": 0.0001, "loss": 7.7289, "loss/crossentropy": 2.2061760425567627, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23551230877637863, "step": 2870 }, { "epoch": 0.1795, "grad_norm": 2.5625, "grad_norm_var": 0.030790201822916665, "learning_rate": 0.0001, "loss": 7.8462, "loss/crossentropy": 2.436152458190918, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2384696677327156, "step": 2872 }, { "epoch": 0.179625, "grad_norm": 2.375, "grad_norm_var": 0.03289388020833333, "learning_rate": 0.0001, "loss": 7.7489, "loss/crossentropy": 2.3130866289138794, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.27254779636859894, "step": 2874 }, { "epoch": 0.17975, "grad_norm": 2.828125, "grad_norm_var": 0.0348052978515625, "learning_rate": 0.0001, "loss": 7.8657, "loss/crossentropy": 2.294617772102356, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23688946664333344, "step": 2876 }, { "epoch": 0.179875, "grad_norm": 2.5625, "grad_norm_var": 0.030497233072916668, "learning_rate": 0.0001, "loss": 7.7343, "loss/crossentropy": 2.3553664684295654, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2647460997104645, "step": 2878 }, { "epoch": 0.18, "grad_norm": 2.734375, "grad_norm_var": 0.031180826822916667, "learning_rate": 0.0001, "loss": 7.7173, "loss/crossentropy": 2.3102493286132812, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23434799164533615, "step": 2880 }, { "epoch": 0.180125, "grad_norm": 2.5, "grad_norm_var": 0.030378214518229165, "learning_rate": 0.0001, "loss": 7.7913, "loss/crossentropy": 2.298587918281555, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22284646332263947, "step": 2882 }, { "epoch": 0.18025, "grad_norm": 2.78125, "grad_norm_var": 0.024128214518229166, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.1285579204559326, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23009717464447021, "step": 2884 }, { "epoch": 0.180375, "grad_norm": 2.546875, "grad_norm_var": 0.021419270833333334, "learning_rate": 0.0001, "loss": 7.7626, "loss/crossentropy": 2.1065726280212402, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.20703819394111633, "step": 2886 }, { "epoch": 0.1805, "grad_norm": 2.734375, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 7.8513, "loss/crossentropy": 2.2261340618133545, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25797754526138306, "step": 2888 }, { "epoch": 0.180625, "grad_norm": 2.421875, "grad_norm_var": 0.0176910400390625, "learning_rate": 0.0001, "loss": 7.8405, "loss/crossentropy": 2.5347334146499634, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25483807921409607, "step": 2890 }, { "epoch": 0.18075, "grad_norm": 2.5625, "grad_norm_var": 0.016141764322916665, "learning_rate": 0.0001, "loss": 7.8018, "loss/crossentropy": 2.2389495372772217, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2264159545302391, "step": 2892 }, { "epoch": 0.180875, "grad_norm": 2.453125, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 7.8118, "loss/crossentropy": 2.2665982246398926, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24241438508033752, "step": 2894 }, { "epoch": 0.181, "grad_norm": 2.5625, "grad_norm_var": 0.017430623372395832, "learning_rate": 0.0001, "loss": 7.7171, "loss/crossentropy": 2.0678027272224426, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21813754737377167, "step": 2896 }, { "epoch": 0.181125, "grad_norm": 2.59375, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 7.7695, "loss/crossentropy": 2.3117023706436157, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23616492748260498, "step": 2898 }, { "epoch": 0.18125, "grad_norm": 2.5625, "grad_norm_var": 0.016434733072916666, "learning_rate": 0.0001, "loss": 7.8045, "loss/crossentropy": 2.30752170085907, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22399520874023438, "step": 2900 }, { "epoch": 0.181375, "grad_norm": 2.421875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 8.0111, "loss/crossentropy": 2.441025972366333, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25707243382930756, "step": 2902 }, { "epoch": 0.1815, "grad_norm": 2.421875, "grad_norm_var": 0.012848917643229167, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.146459937095642, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24353434145450592, "step": 2904 }, { "epoch": 0.181625, "grad_norm": 2.8125, "grad_norm_var": 0.017609659830729166, "learning_rate": 0.0001, "loss": 7.7333, "loss/crossentropy": 2.4673913717269897, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2522246688604355, "step": 2906 }, { "epoch": 0.18175, "grad_norm": 2.40625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 7.7385, "loss/crossentropy": 2.1044957637786865, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.235269233584404, "step": 2908 }, { "epoch": 0.181875, "grad_norm": 2.328125, "grad_norm_var": 0.022347005208333333, "learning_rate": 0.0001, "loss": 7.4037, "loss/crossentropy": 2.2339547872543335, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2398391216993332, "step": 2910 }, { "epoch": 0.182, "grad_norm": 2.6875, "grad_norm_var": 0.0226715087890625, "learning_rate": 0.0001, "loss": 7.5718, "loss/crossentropy": 2.1950976848602295, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23091710358858109, "step": 2912 }, { "epoch": 0.182125, "grad_norm": 2.640625, "grad_norm_var": 0.02310791015625, "learning_rate": 0.0001, "loss": 7.6153, "loss/crossentropy": 2.3033652305603027, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26021257042884827, "step": 2914 }, { "epoch": 0.18225, "grad_norm": 2.4375, "grad_norm_var": 0.017838541666666666, "learning_rate": 0.0001, "loss": 7.8996, "loss/crossentropy": 2.4616453647613525, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2583332806825638, "step": 2916 }, { "epoch": 0.182375, "grad_norm": 2.484375, "grad_norm_var": 0.018456013997395833, "learning_rate": 0.0001, "loss": 7.6534, "loss/crossentropy": 2.3851388692855835, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24298951029777527, "step": 2918 }, { "epoch": 0.1825, "grad_norm": 2.515625, "grad_norm_var": 0.016890462239583334, "learning_rate": 0.0001, "loss": 7.6927, "loss/crossentropy": 2.409003496170044, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22873351722955704, "step": 2920 }, { "epoch": 0.182625, "grad_norm": 2.828125, "grad_norm_var": 0.03720703125, "learning_rate": 0.0001, "loss": 7.7708, "loss/crossentropy": 2.233977437019348, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.250872403383255, "step": 2922 }, { "epoch": 0.18275, "grad_norm": 2.296875, "grad_norm_var": 0.03955790201822917, "learning_rate": 0.0001, "loss": 7.8246, "loss/crossentropy": 2.2683684825897217, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2577130198478699, "step": 2924 }, { "epoch": 0.182875, "grad_norm": 2.734375, "grad_norm_var": 0.04023030598958333, "learning_rate": 0.0001, "loss": 7.7134, "loss/crossentropy": 2.2642526626586914, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25480280816555023, "step": 2926 }, { "epoch": 0.183, "grad_norm": 2.546875, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 7.8294, "loss/crossentropy": 2.176198959350586, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22762203961610794, "step": 2928 }, { "epoch": 0.183125, "grad_norm": 2.65625, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 7.7678, "loss/crossentropy": 2.5741217136383057, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24319174140691757, "step": 2930 }, { "epoch": 0.18325, "grad_norm": 2.765625, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 7.744, "loss/crossentropy": 2.4425946474075317, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2534085810184479, "step": 2932 }, { "epoch": 0.183375, "grad_norm": 2.65625, "grad_norm_var": 0.0351470947265625, "learning_rate": 0.0001, "loss": 7.6372, "loss/crossentropy": 2.136154890060425, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23775418102741241, "step": 2934 }, { "epoch": 0.1835, "grad_norm": 2.5, "grad_norm_var": 0.035380045572916664, "learning_rate": 0.0001, "loss": 7.8286, "loss/crossentropy": 2.249310851097107, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24401143193244934, "step": 2936 }, { "epoch": 0.183625, "grad_norm": 2.71875, "grad_norm_var": 0.018912760416666667, "learning_rate": 0.0001, "loss": 7.8273, "loss/crossentropy": 2.3503148555755615, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24899056553840637, "step": 2938 }, { "epoch": 0.18375, "grad_norm": 2.546875, "grad_norm_var": 0.012923177083333333, "learning_rate": 0.0001, "loss": 7.5477, "loss/crossentropy": 2.147356152534485, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24195496737957, "step": 2940 }, { "epoch": 0.183875, "grad_norm": 2.3125, "grad_norm_var": 0.017496744791666668, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.314660429954529, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24584651738405228, "step": 2942 }, { "epoch": 0.184, "grad_norm": 2.796875, "grad_norm_var": 0.020829264322916666, "learning_rate": 0.0001, "loss": 7.6413, "loss/crossentropy": 2.0859320759773254, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2166888415813446, "step": 2944 }, { "epoch": 0.184125, "grad_norm": 2.234375, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 7.7158, "loss/crossentropy": 2.305862069129944, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23023030161857605, "step": 2946 }, { "epoch": 0.18425, "grad_norm": 2.609375, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 7.635, "loss/crossentropy": 2.345908284187317, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22712141275405884, "step": 2948 }, { "epoch": 0.184375, "grad_norm": 2.90625, "grad_norm_var": 0.03779296875, "learning_rate": 0.0001, "loss": 7.7227, "loss/crossentropy": 1.9320513010025024, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20210829377174377, "step": 2950 }, { "epoch": 0.1845, "grad_norm": 2.265625, "grad_norm_var": 0.04317118326822917, "learning_rate": 0.0001, "loss": 7.6749, "loss/crossentropy": 2.2975679636001587, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23657388985157013, "step": 2952 }, { "epoch": 0.184625, "grad_norm": 2.6875, "grad_norm_var": 0.04095052083333333, "learning_rate": 0.0001, "loss": 7.7486, "loss/crossentropy": 2.2370001077651978, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24035517871379852, "step": 2954 }, { "epoch": 0.18475, "grad_norm": 2.46875, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 7.77, "loss/crossentropy": 2.3344703912734985, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24448612332344055, "step": 2956 }, { "epoch": 0.184875, "grad_norm": 2.671875, "grad_norm_var": 0.0432037353515625, "learning_rate": 0.0001, "loss": 7.7287, "loss/crossentropy": 2.3184186220169067, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24355477839708328, "step": 2958 }, { "epoch": 0.185, "grad_norm": 2.671875, "grad_norm_var": 0.04150390625, "learning_rate": 0.0001, "loss": 7.793, "loss/crossentropy": 2.4449312686920166, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24685372412204742, "step": 2960 }, { "epoch": 0.185125, "grad_norm": 2.5, "grad_norm_var": 0.0351226806640625, "learning_rate": 0.0001, "loss": 7.7295, "loss/crossentropy": 2.0395036935806274, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22259121388196945, "step": 2962 }, { "epoch": 0.18525, "grad_norm": 2.5625, "grad_norm_var": 0.0314453125, "learning_rate": 0.0001, "loss": 7.6614, "loss/crossentropy": 2.029510021209717, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22326287627220154, "step": 2964 }, { "epoch": 0.185375, "grad_norm": 2.65625, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 7.6142, "loss/crossentropy": 2.2890524864196777, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22059021890163422, "step": 2966 }, { "epoch": 0.1855, "grad_norm": 2.578125, "grad_norm_var": 0.021434529622395834, "learning_rate": 0.0001, "loss": 7.7904, "loss/crossentropy": 2.2007906436920166, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22886135429143906, "step": 2968 }, { "epoch": 0.185625, "grad_norm": 2.578125, "grad_norm_var": 0.016097005208333334, "learning_rate": 0.0001, "loss": 7.7165, "loss/crossentropy": 2.4090301990509033, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24099014699459076, "step": 2970 }, { "epoch": 0.18575, "grad_norm": 2.34375, "grad_norm_var": 0.014225260416666666, "learning_rate": 0.0001, "loss": 7.7287, "loss/crossentropy": 2.358201503753662, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2252245992422104, "step": 2972 }, { "epoch": 0.185875, "grad_norm": 2.71875, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.3175272941589355, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23873913288116455, "step": 2974 }, { "epoch": 0.186, "grad_norm": 2.5625, "grad_norm_var": 0.0131988525390625, "learning_rate": 0.0001, "loss": 7.6915, "loss/crossentropy": 2.590595841407776, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24967636168003082, "step": 2976 }, { "epoch": 0.186125, "grad_norm": 2.484375, "grad_norm_var": 0.015306599934895833, "learning_rate": 0.0001, "loss": 7.6723, "loss/crossentropy": 2.2069579362869263, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22214916348457336, "step": 2978 }, { "epoch": 0.18625, "grad_norm": 2.65625, "grad_norm_var": 0.015729777018229165, "learning_rate": 0.0001, "loss": 7.8361, "loss/crossentropy": 2.3302581310272217, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2437615841627121, "step": 2980 }, { "epoch": 0.186375, "grad_norm": 2.453125, "grad_norm_var": 0.014232381184895834, "learning_rate": 0.0001, "loss": 7.6333, "loss/crossentropy": 2.2672786712646484, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22940023988485336, "step": 2982 }, { "epoch": 0.1865, "grad_norm": 2.65625, "grad_norm_var": 0.014290364583333333, "learning_rate": 0.0001, "loss": 7.7551, "loss/crossentropy": 2.528477191925049, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24014821648597717, "step": 2984 }, { "epoch": 0.186625, "grad_norm": 2.796875, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.6699, "loss/crossentropy": 2.134658455848694, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2487226352095604, "step": 2986 }, { "epoch": 0.18675, "grad_norm": 2.34375, "grad_norm_var": 0.0167388916015625, "learning_rate": 0.0001, "loss": 7.6979, "loss/crossentropy": 2.3620848655700684, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24307234585285187, "step": 2988 }, { "epoch": 0.186875, "grad_norm": 2.671875, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 7.7099, "loss/crossentropy": 2.4233195781707764, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2325136736035347, "step": 2990 }, { "epoch": 0.187, "grad_norm": 2.515625, "grad_norm_var": 0.015462239583333334, "learning_rate": 0.0001, "loss": 7.6485, "loss/crossentropy": 2.2925750017166138, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24893249571323395, "step": 2992 }, { "epoch": 0.187125, "grad_norm": 4.125, "grad_norm_var": 0.16298421223958334, "learning_rate": 0.0001, "loss": 7.7527, "loss/crossentropy": 2.1467760801315308, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23566482961177826, "step": 2994 }, { "epoch": 0.18725, "grad_norm": 2.4375, "grad_norm_var": 0.16752827962239583, "learning_rate": 0.0001, "loss": 7.7408, "loss/crossentropy": 2.103184461593628, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2323935329914093, "step": 2996 }, { "epoch": 0.187375, "grad_norm": 2.65625, "grad_norm_var": 0.16705322265625, "learning_rate": 0.0001, "loss": 7.7494, "loss/crossentropy": 2.174781620502472, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25046705454587936, "step": 2998 }, { "epoch": 0.1875, "grad_norm": 2.625, "grad_norm_var": 0.16780192057291668, "learning_rate": 0.0001, "loss": 7.7174, "loss/crossentropy": 2.2673741579055786, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22653405368328094, "step": 3000 }, { "epoch": 0.187625, "grad_norm": 3.296875, "grad_norm_var": 0.19199117024739584, "learning_rate": 0.0001, "loss": 7.7319, "loss/crossentropy": 2.157706141471863, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2287752330303192, "step": 3002 }, { "epoch": 0.18775, "grad_norm": 3.0625, "grad_norm_var": 0.2681223551432292, "learning_rate": 0.0001, "loss": 7.8417, "loss/crossentropy": 2.0640329122543335, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2371016889810562, "step": 3004 }, { "epoch": 0.187875, "grad_norm": 2.328125, "grad_norm_var": 0.28884989420572915, "learning_rate": 0.0001, "loss": 7.4793, "loss/crossentropy": 2.1649757027626038, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.22954751551151276, "step": 3006 }, { "epoch": 0.188, "grad_norm": 2.9375, "grad_norm_var": 0.2865397135416667, "learning_rate": 0.0001, "loss": 7.7681, "loss/crossentropy": 2.402653217315674, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2439199835062027, "step": 3008 }, { "epoch": 0.188125, "grad_norm": 2.671875, "grad_norm_var": 0.16442057291666667, "learning_rate": 0.0001, "loss": 7.7058, "loss/crossentropy": 2.0360541343688965, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2370767444372177, "step": 3010 }, { "epoch": 0.18825, "grad_norm": 2.78125, "grad_norm_var": 0.16503499348958334, "learning_rate": 0.0001, "loss": 7.78, "loss/crossentropy": 2.5376839637756348, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25698722898960114, "step": 3012 }, { "epoch": 0.188375, "grad_norm": 2.546875, "grad_norm_var": 0.16298421223958334, "learning_rate": 0.0001, "loss": 7.6674, "loss/crossentropy": 2.153092384338379, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22976408153772354, "step": 3014 }, { "epoch": 0.1885, "grad_norm": 2.421875, "grad_norm_var": 0.16728413899739583, "learning_rate": 0.0001, "loss": 7.9659, "loss/crossentropy": 2.4408687353134155, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2630517780780792, "step": 3016 }, { "epoch": 0.188625, "grad_norm": 2.546875, "grad_norm_var": 0.15907796223958334, "learning_rate": 0.0001, "loss": 7.6187, "loss/crossentropy": 2.1757636070251465, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2302849441766739, "step": 3018 }, { "epoch": 0.18875, "grad_norm": 2.71875, "grad_norm_var": 0.03792215983072917, "learning_rate": 0.0001, "loss": 7.5604, "loss/crossentropy": 2.064240336418152, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22439640015363693, "step": 3020 }, { "epoch": 0.188875, "grad_norm": 2.46875, "grad_norm_var": 0.03224995930989583, "learning_rate": 0.0001, "loss": 7.6954, "loss/crossentropy": 2.1221320629119873, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22975972294807434, "step": 3022 }, { "epoch": 0.189, "grad_norm": 2.59375, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 7.8211, "loss/crossentropy": 2.243198275566101, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2282658889889717, "step": 3024 }, { "epoch": 0.189125, "grad_norm": 2.5, "grad_norm_var": 0.0191558837890625, "learning_rate": 0.0001, "loss": 7.6639, "loss/crossentropy": 1.9989042282104492, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23544684797525406, "step": 3026 }, { "epoch": 0.18925, "grad_norm": 2.671875, "grad_norm_var": 0.015510050455729167, "learning_rate": 0.0001, "loss": 7.6059, "loss/crossentropy": 2.0583502054214478, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22483647614717484, "step": 3028 }, { "epoch": 0.189375, "grad_norm": 2.5625, "grad_norm_var": 0.019481404622395834, "learning_rate": 0.0001, "loss": 7.5609, "loss/crossentropy": 2.213624954223633, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23435892909765244, "step": 3030 }, { "epoch": 0.1895, "grad_norm": 2.625, "grad_norm_var": 0.015445963541666666, "learning_rate": 0.0001, "loss": 7.7361, "loss/crossentropy": 2.2522560358047485, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24112869054079056, "step": 3032 }, { "epoch": 0.189625, "grad_norm": 2.453125, "grad_norm_var": 0.011865234375, "learning_rate": 0.0001, "loss": 7.9651, "loss/crossentropy": 2.325987696647644, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22860444337129593, "step": 3034 }, { "epoch": 0.18975, "grad_norm": 2.75, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 7.6774, "loss/crossentropy": 2.292188882827759, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24093221873044968, "step": 3036 }, { "epoch": 0.189875, "grad_norm": 2.859375, "grad_norm_var": 0.020677693684895835, "learning_rate": 0.0001, "loss": 7.7072, "loss/crossentropy": 2.1392345428466797, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2271246463060379, "step": 3038 }, { "epoch": 0.19, "grad_norm": 2.328125, "grad_norm_var": 0.025584920247395834, "learning_rate": 0.0001, "loss": 7.7131, "loss/crossentropy": 2.3634947538375854, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2574180141091347, "step": 3040 }, { "epoch": 0.190125, "grad_norm": 2.5, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 7.6816, "loss/crossentropy": 2.224321484565735, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22878948599100113, "step": 3042 }, { "epoch": 0.19025, "grad_norm": 2.65625, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 7.7158, "loss/crossentropy": 2.471584916114807, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2526979222893715, "step": 3044 }, { "epoch": 0.190375, "grad_norm": 2.703125, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 7.4846, "loss/crossentropy": 2.2407480478286743, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23275888711214066, "step": 3046 }, { "epoch": 0.1905, "grad_norm": 2.484375, "grad_norm_var": 0.023140462239583333, "learning_rate": 0.0001, "loss": 7.7875, "loss/crossentropy": 2.2426388263702393, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23194129765033722, "step": 3048 }, { "epoch": 0.190625, "grad_norm": 2.921875, "grad_norm_var": 0.029488118489583333, "learning_rate": 0.0001, "loss": 7.6974, "loss/crossentropy": 2.3478230237960815, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24639128148555756, "step": 3050 }, { "epoch": 0.19075, "grad_norm": 2.390625, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 7.8786, "loss/crossentropy": 2.3184871673583984, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2819037437438965, "step": 3052 }, { "epoch": 0.190875, "grad_norm": 2.59375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 7.6416, "loss/crossentropy": 2.140998363494873, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22248034179210663, "step": 3054 }, { "epoch": 0.191, "grad_norm": 2.40625, "grad_norm_var": 0.024540201822916666, "learning_rate": 0.0001, "loss": 7.7976, "loss/crossentropy": 2.492767333984375, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2436331883072853, "step": 3056 }, { "epoch": 0.191125, "grad_norm": 2.796875, "grad_norm_var": 0.026851399739583334, "learning_rate": 0.0001, "loss": 7.6777, "loss/crossentropy": 1.9927314519882202, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22899659723043442, "step": 3058 }, { "epoch": 0.19125, "grad_norm": 2.5, "grad_norm_var": 0.0286285400390625, "learning_rate": 0.0001, "loss": 7.9659, "loss/crossentropy": 2.2941343784332275, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24661502987146378, "step": 3060 }, { "epoch": 0.191375, "grad_norm": 2.5, "grad_norm_var": 0.02808837890625, "learning_rate": 0.0001, "loss": 7.8332, "loss/crossentropy": 2.2587934732437134, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23479507118463516, "step": 3062 }, { "epoch": 0.1915, "grad_norm": 2.484375, "grad_norm_var": 0.0285797119140625, "learning_rate": 0.0001, "loss": 7.7855, "loss/crossentropy": 2.448140263557434, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24534112215042114, "step": 3064 }, { "epoch": 0.191625, "grad_norm": 2.734375, "grad_norm_var": 0.022802734375, "learning_rate": 0.0001, "loss": 7.6054, "loss/crossentropy": 2.5346790552139282, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2549649178981781, "step": 3066 }, { "epoch": 0.19175, "grad_norm": 2.390625, "grad_norm_var": 0.0201324462890625, "learning_rate": 0.0001, "loss": 7.6652, "loss/crossentropy": 2.139198064804077, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22500670701265335, "step": 3068 }, { "epoch": 0.191875, "grad_norm": 2.5, "grad_norm_var": 0.020052083333333335, "learning_rate": 0.0001, "loss": 7.6343, "loss/crossentropy": 2.2101441621780396, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24504300951957703, "step": 3070 }, { "epoch": 0.192, "grad_norm": 2.78125, "grad_norm_var": 0.018387858072916666, "learning_rate": 0.0001, "loss": 7.8094, "loss/crossentropy": 2.387241005897522, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23291389644145966, "step": 3072 }, { "epoch": 0.192125, "grad_norm": 2.515625, "grad_norm_var": 0.015632120768229167, "learning_rate": 0.0001, "loss": 7.657, "loss/crossentropy": 2.015101671218872, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2168959379196167, "step": 3074 }, { "epoch": 0.19225, "grad_norm": 2.359375, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.201832890510559, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25159038603305817, "step": 3076 }, { "epoch": 0.192375, "grad_norm": 2.640625, "grad_norm_var": 0.017699178059895834, "learning_rate": 0.0001, "loss": 7.7274, "loss/crossentropy": 2.4729052782058716, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2636168450117111, "step": 3078 }, { "epoch": 0.1925, "grad_norm": 2.5625, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 7.4003, "loss/crossentropy": 2.096401810646057, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22800646722316742, "step": 3080 }, { "epoch": 0.192625, "grad_norm": 2.71875, "grad_norm_var": 0.017316691080729165, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 2.1243752241134644, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2130807489156723, "step": 3082 }, { "epoch": 0.19275, "grad_norm": 2.59375, "grad_norm_var": 0.016258748372395833, "learning_rate": 0.0001, "loss": 7.6234, "loss/crossentropy": 2.393889904022217, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23582585901021957, "step": 3084 }, { "epoch": 0.192875, "grad_norm": 2.453125, "grad_norm_var": 0.0202789306640625, "learning_rate": 0.0001, "loss": 7.7359, "loss/crossentropy": 2.530544877052307, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23919443786144257, "step": 3086 }, { "epoch": 0.193, "grad_norm": 2.453125, "grad_norm_var": 0.018895467122395832, "learning_rate": 0.0001, "loss": 7.7122, "loss/crossentropy": 2.14614474773407, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23280902951955795, "step": 3088 }, { "epoch": 0.193125, "grad_norm": 2.546875, "grad_norm_var": 0.025519816080729167, "learning_rate": 0.0001, "loss": 7.6703, "loss/crossentropy": 2.1698378324508667, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23564688116312027, "step": 3090 }, { "epoch": 0.19325, "grad_norm": 2.296875, "grad_norm_var": 0.0282379150390625, "learning_rate": 0.0001, "loss": 7.5935, "loss/crossentropy": 2.2789262533187866, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23355238884687424, "step": 3092 }, { "epoch": 0.193375, "grad_norm": 2.78125, "grad_norm_var": 0.030013020833333334, "learning_rate": 0.0001, "loss": 7.8383, "loss/crossentropy": 2.548181891441345, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.242641419172287, "step": 3094 }, { "epoch": 0.1935, "grad_norm": 2.4375, "grad_norm_var": 0.031525675455729166, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.34587025642395, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2449270710349083, "step": 3096 }, { "epoch": 0.193625, "grad_norm": 3.015625, "grad_norm_var": 0.04414774576822917, "learning_rate": 0.0001, "loss": 7.5994, "loss/crossentropy": 2.2254514694213867, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2519787400960922, "step": 3098 }, { "epoch": 0.19375, "grad_norm": 2.640625, "grad_norm_var": 0.060301717122395834, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.3490875959396362, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24305754899978638, "step": 3100 }, { "epoch": 0.193875, "grad_norm": 2.34375, "grad_norm_var": 0.0575836181640625, "learning_rate": 0.0001, "loss": 7.7137, "loss/crossentropy": 2.021351933479309, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2211918607354164, "step": 3102 }, { "epoch": 0.194, "grad_norm": 2.8125, "grad_norm_var": 0.060791015625, "learning_rate": 0.0001, "loss": 7.7423, "loss/crossentropy": 2.1594278812408447, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24009329080581665, "step": 3104 }, { "epoch": 0.194125, "grad_norm": 2.578125, "grad_norm_var": 0.059137980143229164, "learning_rate": 0.0001, "loss": 7.7889, "loss/crossentropy": 2.459377884864807, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.251203328371048, "step": 3106 }, { "epoch": 0.19425, "grad_norm": 2.921875, "grad_norm_var": 0.05748291015625, "learning_rate": 0.0001, "loss": 7.5124, "loss/crossentropy": 2.0707362294197083, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21844393759965897, "step": 3108 }, { "epoch": 0.194375, "grad_norm": 2.453125, "grad_norm_var": 0.101025390625, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.2419523000717163, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2376825362443924, "step": 3110 }, { "epoch": 0.1945, "grad_norm": 2.46875, "grad_norm_var": 0.10181376139322916, "learning_rate": 0.0001, "loss": 7.6479, "loss/crossentropy": 2.3259233236312866, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25602586567401886, "step": 3112 }, { "epoch": 0.194625, "grad_norm": 2.796875, "grad_norm_var": 0.0926910400390625, "learning_rate": 0.0001, "loss": 7.5447, "loss/crossentropy": 1.9941769242286682, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21642977744340897, "step": 3114 }, { "epoch": 0.19475, "grad_norm": 2.578125, "grad_norm_var": 0.09576822916666666, "learning_rate": 0.0001, "loss": 7.8191, "loss/crossentropy": 2.5602025985717773, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25449611991643906, "step": 3116 }, { "epoch": 0.194875, "grad_norm": 2.484375, "grad_norm_var": 0.09041239420572916, "learning_rate": 0.0001, "loss": 7.8136, "loss/crossentropy": 2.3339617252349854, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23222877830266953, "step": 3118 }, { "epoch": 0.195, "grad_norm": 2.453125, "grad_norm_var": 0.09040425618489584, "learning_rate": 0.0001, "loss": 7.7202, "loss/crossentropy": 2.4900479316711426, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24146823585033417, "step": 3120 }, { "epoch": 0.195125, "grad_norm": 2.734375, "grad_norm_var": 0.09036356608072917, "learning_rate": 0.0001, "loss": 7.6583, "loss/crossentropy": 2.191547393798828, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2142081782221794, "step": 3122 }, { "epoch": 0.19525, "grad_norm": 2.375, "grad_norm_var": 0.090185546875, "learning_rate": 0.0001, "loss": 7.6174, "loss/crossentropy": 2.3771393299102783, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2471369206905365, "step": 3124 }, { "epoch": 0.195375, "grad_norm": 2.625, "grad_norm_var": 0.04331766764322917, "learning_rate": 0.0001, "loss": 7.5707, "loss/crossentropy": 2.3110562562942505, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23804593086242676, "step": 3126 }, { "epoch": 0.1955, "grad_norm": 2.5, "grad_norm_var": 0.040379842122395836, "learning_rate": 0.0001, "loss": 7.6465, "loss/crossentropy": 2.0701069831848145, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23578406125307083, "step": 3128 }, { "epoch": 0.195625, "grad_norm": 2.421875, "grad_norm_var": 0.038134765625, "learning_rate": 0.0001, "loss": 7.6923, "loss/crossentropy": 2.2268728017807007, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22668010741472244, "step": 3130 }, { "epoch": 0.19575, "grad_norm": 2.546875, "grad_norm_var": 0.018724568684895835, "learning_rate": 0.0001, "loss": 7.7925, "loss/crossentropy": 2.288881540298462, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2400398850440979, "step": 3132 }, { "epoch": 0.195875, "grad_norm": 2.546875, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.3762770891189575, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24239980429410934, "step": 3134 }, { "epoch": 0.196, "grad_norm": 2.359375, "grad_norm_var": 0.022037760416666666, "learning_rate": 0.0001, "loss": 7.5924, "loss/crossentropy": 2.0780075788497925, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2166167050600052, "step": 3136 }, { "epoch": 0.196125, "grad_norm": 2.359375, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 7.6296, "loss/crossentropy": 2.2321070432662964, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.257367268204689, "step": 3138 }, { "epoch": 0.19625, "grad_norm": 2.5, "grad_norm_var": 0.0124664306640625, "learning_rate": 0.0001, "loss": 7.655, "loss/crossentropy": 2.2101333141326904, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22183632850646973, "step": 3140 }, { "epoch": 0.196375, "grad_norm": 2.5, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 7.7539, "loss/crossentropy": 2.3512450456619263, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2585148215293884, "step": 3142 }, { "epoch": 0.1965, "grad_norm": 2.578125, "grad_norm_var": 0.012360636393229167, "learning_rate": 0.0001, "loss": 7.6486, "loss/crossentropy": 2.224077582359314, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23895263671875, "step": 3144 }, { "epoch": 0.196625, "grad_norm": 2.40625, "grad_norm_var": 0.01246337890625, "learning_rate": 0.0001, "loss": 7.5564, "loss/crossentropy": 2.1337246894836426, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2377912774682045, "step": 3146 }, { "epoch": 0.19675, "grad_norm": 2.46875, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 7.6371, "loss/crossentropy": 2.1832423210144043, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2349269688129425, "step": 3148 }, { "epoch": 0.196875, "grad_norm": 2.515625, "grad_norm_var": 0.0158203125, "learning_rate": 0.0001, "loss": 7.6147, "loss/crossentropy": 2.2364492416381836, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24817125499248505, "step": 3150 }, { "epoch": 0.197, "grad_norm": 2.21875, "grad_norm_var": 0.019254557291666665, "learning_rate": 0.0001, "loss": 7.5449, "loss/crossentropy": 2.30988085269928, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.26015302538871765, "step": 3152 }, { "epoch": 0.197125, "grad_norm": 2.671875, "grad_norm_var": 0.020392862955729167, "learning_rate": 0.0001, "loss": 7.7401, "loss/crossentropy": 2.053266227245331, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24950231611728668, "step": 3154 }, { "epoch": 0.19725, "grad_norm": 2.640625, "grad_norm_var": 0.021126302083333333, "learning_rate": 0.0001, "loss": 7.6193, "loss/crossentropy": 2.2058684825897217, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23218996822834015, "step": 3156 }, { "epoch": 0.197375, "grad_norm": 2.671875, "grad_norm_var": 0.031148274739583332, "learning_rate": 0.0001, "loss": 7.83, "loss/crossentropy": 2.3524067401885986, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24826574325561523, "step": 3158 }, { "epoch": 0.1975, "grad_norm": 2.609375, "grad_norm_var": 0.032079060872395836, "learning_rate": 0.0001, "loss": 7.6028, "loss/crossentropy": 2.124649167060852, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23834124207496643, "step": 3160 }, { "epoch": 0.197625, "grad_norm": 3.203125, "grad_norm_var": 0.05683186848958333, "learning_rate": 0.0001, "loss": 7.6409, "loss/crossentropy": 2.170323371887207, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24094149470329285, "step": 3162 }, { "epoch": 0.19775, "grad_norm": 2.65625, "grad_norm_var": 0.06315816243489583, "learning_rate": 0.0001, "loss": 7.7351, "loss/crossentropy": 2.340814709663391, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2378515675663948, "step": 3164 }, { "epoch": 0.197875, "grad_norm": 2.640625, "grad_norm_var": 0.0582672119140625, "learning_rate": 0.0001, "loss": 7.7131, "loss/crossentropy": 2.454757571220398, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25292622298002243, "step": 3166 }, { "epoch": 0.198, "grad_norm": 2.59375, "grad_norm_var": 0.0470703125, "learning_rate": 0.0001, "loss": 7.7054, "loss/crossentropy": 2.3518433570861816, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2326052561402321, "step": 3168 }, { "epoch": 0.198125, "grad_norm": 2.375, "grad_norm_var": 0.049103800455729166, "learning_rate": 0.0001, "loss": 7.8036, "loss/crossentropy": 2.3869314193725586, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2253723442554474, "step": 3170 }, { "epoch": 0.19825, "grad_norm": 2.453125, "grad_norm_var": 0.05646158854166667, "learning_rate": 0.0001, "loss": 7.549, "loss/crossentropy": 2.1406772136688232, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2136363908648491, "step": 3172 }, { "epoch": 0.198375, "grad_norm": 2.65625, "grad_norm_var": 0.050959269205729164, "learning_rate": 0.0001, "loss": 7.7547, "loss/crossentropy": 2.276672065258026, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2241700440645218, "step": 3174 }, { "epoch": 0.1985, "grad_norm": 2.234375, "grad_norm_var": 0.056696573893229164, "learning_rate": 0.0001, "loss": 7.5706, "loss/crossentropy": 2.0704278349876404, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22979146987199783, "step": 3176 }, { "epoch": 0.198625, "grad_norm": 2.625, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 7.6319, "loss/crossentropy": 2.155359983444214, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23411893844604492, "step": 3178 }, { "epoch": 0.19875, "grad_norm": 2.890625, "grad_norm_var": 0.026708984375, "learning_rate": 0.0001, "loss": 7.6763, "loss/crossentropy": 2.17472767829895, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2461443468928337, "step": 3180 }, { "epoch": 0.198875, "grad_norm": 2.515625, "grad_norm_var": 0.02847900390625, "learning_rate": 0.0001, "loss": 7.61, "loss/crossentropy": 2.401307702064514, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2419404238462448, "step": 3182 }, { "epoch": 0.199, "grad_norm": 2.359375, "grad_norm_var": 0.03395894368489583, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.303532361984253, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23136408627033234, "step": 3184 }, { "epoch": 0.199125, "grad_norm": 2.8125, "grad_norm_var": 0.03798726399739583, "learning_rate": 0.0001, "loss": 7.7585, "loss/crossentropy": 2.691552758216858, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24934251606464386, "step": 3186 }, { "epoch": 0.19925, "grad_norm": 2.234375, "grad_norm_var": 0.041975911458333334, "learning_rate": 0.0001, "loss": 7.6867, "loss/crossentropy": 2.1433998346328735, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23058529198169708, "step": 3188 }, { "epoch": 0.199375, "grad_norm": 2.484375, "grad_norm_var": 0.04368387858072917, "learning_rate": 0.0001, "loss": 7.6658, "loss/crossentropy": 2.075712561607361, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23563528060913086, "step": 3190 }, { "epoch": 0.1995, "grad_norm": 2.515625, "grad_norm_var": 0.038849894205729166, "learning_rate": 0.0001, "loss": 7.6327, "loss/crossentropy": 2.239920735359192, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24973846971988678, "step": 3192 }, { "epoch": 0.199625, "grad_norm": 2.5625, "grad_norm_var": 0.03837788899739583, "learning_rate": 0.0001, "loss": 7.6491, "loss/crossentropy": 2.2711654901504517, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2149556428194046, "step": 3194 }, { "epoch": 0.19975, "grad_norm": 2.703125, "grad_norm_var": 0.033543904622395836, "learning_rate": 0.0001, "loss": 7.6259, "loss/crossentropy": 2.2742727994918823, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24378067255020142, "step": 3196 }, { "epoch": 0.199875, "grad_norm": 2.296875, "grad_norm_var": 0.03476155598958333, "learning_rate": 0.0001, "loss": 7.6477, "loss/crossentropy": 2.074104130268097, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22755059599876404, "step": 3198 }, { "epoch": 0.2, "grad_norm": 2.71875, "grad_norm_var": 0.03190816243489583, "learning_rate": 0.0001, "loss": 7.4978, "loss/crossentropy": 2.2103521823883057, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22538809478282928, "step": 3200 }, { "epoch": 0.200125, "grad_norm": 2.28125, "grad_norm_var": 0.027132161458333335, "learning_rate": 0.0001, "loss": 7.4522, "loss/crossentropy": 2.262304186820984, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22689391672611237, "step": 3202 }, { "epoch": 0.20025, "grad_norm": 2.484375, "grad_norm_var": 0.024527994791666667, "learning_rate": 0.0001, "loss": 7.6634, "loss/crossentropy": 2.159933626651764, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2550048828125, "step": 3204 }, { "epoch": 0.200375, "grad_norm": 2.5, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 7.7359, "loss/crossentropy": 2.2928545475006104, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2492647022008896, "step": 3206 }, { "epoch": 0.2005, "grad_norm": 2.453125, "grad_norm_var": 0.022435506184895832, "learning_rate": 0.0001, "loss": 7.6104, "loss/crossentropy": 2.151831030845642, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21393808722496033, "step": 3208 }, { "epoch": 0.200625, "grad_norm": 2.46875, "grad_norm_var": 0.022005208333333335, "learning_rate": 0.0001, "loss": 7.8164, "loss/crossentropy": 2.166096329689026, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23471974581480026, "step": 3210 }, { "epoch": 0.20075, "grad_norm": 2.734375, "grad_norm_var": 0.018277994791666665, "learning_rate": 0.0001, "loss": 7.565, "loss/crossentropy": 2.181807518005371, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23415996134281158, "step": 3212 }, { "epoch": 0.200875, "grad_norm": 2.453125, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 7.5114, "loss/crossentropy": 2.1912107467651367, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22295787930488586, "step": 3214 }, { "epoch": 0.201, "grad_norm": 2.453125, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 7.4921, "loss/crossentropy": 2.297171950340271, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25197841227054596, "step": 3216 }, { "epoch": 0.201125, "grad_norm": 2.359375, "grad_norm_var": 0.015364583333333333, "learning_rate": 0.0001, "loss": 7.7764, "loss/crossentropy": 2.484106659889221, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2731506675481796, "step": 3218 }, { "epoch": 0.20125, "grad_norm": 2.359375, "grad_norm_var": 0.017975870768229166, "learning_rate": 0.0001, "loss": 7.6025, "loss/crossentropy": 2.156631350517273, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21255087107419968, "step": 3220 }, { "epoch": 0.201375, "grad_norm": 2.421875, "grad_norm_var": 0.0179351806640625, "learning_rate": 0.0001, "loss": 7.5934, "loss/crossentropy": 2.3413909673690796, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22059939801692963, "step": 3222 }, { "epoch": 0.2015, "grad_norm": 3.203125, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 7.6958, "loss/crossentropy": 2.313757300376892, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23688851296901703, "step": 3224 }, { "epoch": 0.201625, "grad_norm": 2.640625, "grad_norm_var": 0.052643839518229166, "learning_rate": 0.0001, "loss": 7.5624, "loss/crossentropy": 2.3218902349472046, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23602160066366196, "step": 3226 }, { "epoch": 0.20175, "grad_norm": 2.421875, "grad_norm_var": 0.0511138916015625, "learning_rate": 0.0001, "loss": 7.6753, "loss/crossentropy": 2.6514742374420166, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2617443650960922, "step": 3228 }, { "epoch": 0.201875, "grad_norm": 2.328125, "grad_norm_var": 0.05115559895833333, "learning_rate": 0.0001, "loss": 7.7007, "loss/crossentropy": 2.27648389339447, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23478404432535172, "step": 3230 }, { "epoch": 0.202, "grad_norm": 2.75, "grad_norm_var": 0.05434468587239583, "learning_rate": 0.0001, "loss": 7.7911, "loss/crossentropy": 2.3282746076583862, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24819976091384888, "step": 3232 }, { "epoch": 0.202125, "grad_norm": 2.359375, "grad_norm_var": 0.05576883951822917, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.0597460865974426, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23130206763744354, "step": 3234 }, { "epoch": 0.20225, "grad_norm": 2.484375, "grad_norm_var": 0.04765625, "learning_rate": 0.0001, "loss": 7.777, "loss/crossentropy": 2.3545076847076416, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22461315244436264, "step": 3236 }, { "epoch": 0.202375, "grad_norm": 2.734375, "grad_norm_var": 0.045506795247395836, "learning_rate": 0.0001, "loss": 7.6893, "loss/crossentropy": 2.2642041444778442, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24759702384471893, "step": 3238 }, { "epoch": 0.2025, "grad_norm": 2.359375, "grad_norm_var": 0.020438639322916667, "learning_rate": 0.0001, "loss": 7.5394, "loss/crossentropy": 2.405397891998291, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24760686606168747, "step": 3240 }, { "epoch": 0.202625, "grad_norm": 2.4375, "grad_norm_var": 0.018050130208333334, "learning_rate": 0.0001, "loss": 7.5978, "loss/crossentropy": 1.8945466876029968, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20826154947280884, "step": 3242 }, { "epoch": 0.20275, "grad_norm": 2.59375, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 7.6012, "loss/crossentropy": 2.2214646339416504, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21611948311328888, "step": 3244 }, { "epoch": 0.202875, "grad_norm": 2.40625, "grad_norm_var": 0.01685791015625, "learning_rate": 0.0001, "loss": 7.5799, "loss/crossentropy": 2.41989004611969, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2531380206346512, "step": 3246 }, { "epoch": 0.203, "grad_norm": 2.75, "grad_norm_var": 0.016499837239583332, "learning_rate": 0.0001, "loss": 7.6609, "loss/crossentropy": 2.1457839012145996, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22547265142202377, "step": 3248 }, { "epoch": 0.203125, "grad_norm": 2.515625, "grad_norm_var": 0.011815388997395834, "learning_rate": 0.0001, "loss": 7.6925, "loss/crossentropy": 2.202640414237976, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24964796006679535, "step": 3250 }, { "epoch": 0.20325, "grad_norm": 2.59375, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 7.8597, "loss/crossentropy": 2.2329805493354797, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.21109139919281006, "step": 3252 }, { "epoch": 0.203375, "grad_norm": 2.546875, "grad_norm_var": 0.013602701822916667, "learning_rate": 0.0001, "loss": 7.6793, "loss/crossentropy": 2.37657368183136, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22429364919662476, "step": 3254 }, { "epoch": 0.2035, "grad_norm": 2.515625, "grad_norm_var": 0.013654581705729167, "learning_rate": 0.0001, "loss": 7.5988, "loss/crossentropy": 2.240600347518921, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24599803984165192, "step": 3256 }, { "epoch": 0.203625, "grad_norm": 2.296875, "grad_norm_var": 0.020670572916666668, "learning_rate": 0.0001, "loss": 7.6654, "loss/crossentropy": 2.5686757564544678, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.263079509139061, "step": 3258 }, { "epoch": 0.20375, "grad_norm": 2.53125, "grad_norm_var": 0.023265584309895834, "learning_rate": 0.0001, "loss": 7.824, "loss/crossentropy": 2.150991916656494, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2586375027894974, "step": 3260 }, { "epoch": 0.203875, "grad_norm": 2.453125, "grad_norm_var": 0.026048787434895835, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.370198965072632, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23691494762897491, "step": 3262 }, { "epoch": 0.204, "grad_norm": 2.28125, "grad_norm_var": 0.029390462239583335, "learning_rate": 0.0001, "loss": 7.5166, "loss/crossentropy": 2.2037036418914795, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23511488735675812, "step": 3264 }, { "epoch": 0.204125, "grad_norm": 2.625, "grad_norm_var": 0.032548014322916666, "learning_rate": 0.0001, "loss": 7.631, "loss/crossentropy": 2.2161275148391724, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22294826805591583, "step": 3266 }, { "epoch": 0.20425, "grad_norm": 3.015625, "grad_norm_var": 0.06555887858072916, "learning_rate": 0.0001, "loss": 7.8449, "loss/crossentropy": 2.403158664703369, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24931098520755768, "step": 3268 }, { "epoch": 0.204375, "grad_norm": 2.4375, "grad_norm_var": 0.06297098795572917, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.4231287240982056, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24163633584976196, "step": 3270 }, { "epoch": 0.2045, "grad_norm": 2.53125, "grad_norm_var": 0.06093648274739583, "learning_rate": 0.0001, "loss": 7.4887, "loss/crossentropy": 2.196571111679077, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2219029664993286, "step": 3272 }, { "epoch": 0.204625, "grad_norm": 2.5, "grad_norm_var": 0.05232645670572917, "learning_rate": 0.0001, "loss": 7.6319, "loss/crossentropy": 2.336767315864563, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23825974017381668, "step": 3274 }, { "epoch": 0.20475, "grad_norm": 2.25, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 7.4521, "loss/crossentropy": 1.9283623099327087, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2104254812002182, "step": 3276 }, { "epoch": 0.204875, "grad_norm": 2.75, "grad_norm_var": 0.05458882649739583, "learning_rate": 0.0001, "loss": 7.7259, "loss/crossentropy": 2.250017523765564, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23542233556509018, "step": 3278 }, { "epoch": 0.205, "grad_norm": 2.265625, "grad_norm_var": 0.053807576497395836, "learning_rate": 0.0001, "loss": 7.5853, "loss/crossentropy": 2.0355631709098816, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21646135300397873, "step": 3280 }, { "epoch": 0.205125, "grad_norm": 2.53125, "grad_norm_var": 0.056761678059895834, "learning_rate": 0.0001, "loss": 7.5638, "loss/crossentropy": 2.1133495569229126, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2248711958527565, "step": 3282 }, { "epoch": 0.20525, "grad_norm": 2.421875, "grad_norm_var": 0.0180816650390625, "learning_rate": 0.0001, "loss": 7.6095, "loss/crossentropy": 2.4140706062316895, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22342108935117722, "step": 3284 }, { "epoch": 0.205375, "grad_norm": 2.453125, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 7.6281, "loss/crossentropy": 2.4801841974258423, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24090874940156937, "step": 3286 }, { "epoch": 0.2055, "grad_norm": 2.671875, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 7.6194, "loss/crossentropy": 2.333125591278076, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23281607031822205, "step": 3288 }, { "epoch": 0.205625, "grad_norm": 2.328125, "grad_norm_var": 0.022443644205729165, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.057736098766327, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.21230417490005493, "step": 3290 }, { "epoch": 0.20575, "grad_norm": 2.5, "grad_norm_var": 0.019025675455729165, "learning_rate": 0.0001, "loss": 7.5554, "loss/crossentropy": 2.2296417951583862, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24230563640594482, "step": 3292 }, { "epoch": 0.205875, "grad_norm": 2.640625, "grad_norm_var": 0.0163482666015625, "learning_rate": 0.0001, "loss": 7.5161, "loss/crossentropy": 2.4877594709396362, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23580122739076614, "step": 3294 }, { "epoch": 0.206, "grad_norm": 2.65625, "grad_norm_var": 0.015816243489583333, "learning_rate": 0.0001, "loss": 7.6607, "loss/crossentropy": 2.22783100605011, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23493072390556335, "step": 3296 }, { "epoch": 0.206125, "grad_norm": 3.484375, "grad_norm_var": 0.06994527180989583, "learning_rate": 0.0001, "loss": 7.548, "loss/crossentropy": 2.177275240421295, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23492375016212463, "step": 3298 }, { "epoch": 0.20625, "grad_norm": 2.1875, "grad_norm_var": 0.07803446451822917, "learning_rate": 0.0001, "loss": 7.612, "loss/crossentropy": 2.100243628025055, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21778713911771774, "step": 3300 }, { "epoch": 0.206375, "grad_norm": 2.40625, "grad_norm_var": 0.08179931640625, "learning_rate": 0.0001, "loss": 7.6651, "loss/crossentropy": 2.612051248550415, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.236809641122818, "step": 3302 }, { "epoch": 0.2065, "grad_norm": 2.53125, "grad_norm_var": 0.10703125, "learning_rate": 0.0001, "loss": 7.7176, "loss/crossentropy": 2.3358840942382812, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22349786013364792, "step": 3304 }, { "epoch": 0.206625, "grad_norm": 2.390625, "grad_norm_var": 0.10436197916666666, "learning_rate": 0.0001, "loss": 7.6017, "loss/crossentropy": 2.0664124488830566, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22234025597572327, "step": 3306 }, { "epoch": 0.20675, "grad_norm": 2.765625, "grad_norm_var": 0.105126953125, "learning_rate": 0.0001, "loss": 7.7791, "loss/crossentropy": 2.3125079870224, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22895997017621994, "step": 3308 }, { "epoch": 0.206875, "grad_norm": 2.5625, "grad_norm_var": 0.10261942545572916, "learning_rate": 0.0001, "loss": 7.761, "loss/crossentropy": 2.0990543365478516, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22909457981586456, "step": 3310 }, { "epoch": 0.207, "grad_norm": 2.359375, "grad_norm_var": 0.11274312337239584, "learning_rate": 0.0001, "loss": 7.6989, "loss/crossentropy": 2.3832221031188965, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23881246894598007, "step": 3312 }, { "epoch": 0.207125, "grad_norm": 2.5625, "grad_norm_var": 0.053099568684895834, "learning_rate": 0.0001, "loss": 7.5829, "loss/crossentropy": 2.070538818836212, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22222436219453812, "step": 3314 }, { "epoch": 0.20725, "grad_norm": 2.359375, "grad_norm_var": 0.0491607666015625, "learning_rate": 0.0001, "loss": 7.7099, "loss/crossentropy": 2.0685949325561523, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.225556842982769, "step": 3316 }, { "epoch": 0.207375, "grad_norm": 2.453125, "grad_norm_var": 0.0464752197265625, "learning_rate": 0.0001, "loss": 7.5792, "loss/crossentropy": 2.2039034366607666, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23356658220291138, "step": 3318 }, { "epoch": 0.2075, "grad_norm": 2.359375, "grad_norm_var": 0.017772420247395834, "learning_rate": 0.0001, "loss": 7.6484, "loss/crossentropy": 2.520377278327942, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23397985100746155, "step": 3320 }, { "epoch": 0.207625, "grad_norm": 2.421875, "grad_norm_var": 0.018724568684895835, "learning_rate": 0.0001, "loss": 7.6201, "loss/crossentropy": 2.232245087623596, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2378198802471161, "step": 3322 }, { "epoch": 0.20775, "grad_norm": 2.703125, "grad_norm_var": 0.017574055989583334, "learning_rate": 0.0001, "loss": 7.7734, "loss/crossentropy": 2.3666106462478638, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22776059806346893, "step": 3324 }, { "epoch": 0.207875, "grad_norm": 2.453125, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 7.692, "loss/crossentropy": 2.4405359029769897, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23942655324935913, "step": 3326 }, { "epoch": 0.208, "grad_norm": 2.3125, "grad_norm_var": 0.026097615559895832, "learning_rate": 0.0001, "loss": 7.4516, "loss/crossentropy": 2.165894627571106, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21770837903022766, "step": 3328 }, { "epoch": 0.208125, "grad_norm": 2.3125, "grad_norm_var": 0.026106770833333334, "learning_rate": 0.0001, "loss": 7.5851, "loss/crossentropy": 2.2975982427597046, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24745838344097137, "step": 3330 }, { "epoch": 0.20825, "grad_norm": 2.28125, "grad_norm_var": 0.0307769775390625, "learning_rate": 0.0001, "loss": 7.5979, "loss/crossentropy": 2.1849515438079834, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23390965163707733, "step": 3332 }, { "epoch": 0.208375, "grad_norm": 2.484375, "grad_norm_var": 0.028743489583333334, "learning_rate": 0.0001, "loss": 7.7235, "loss/crossentropy": 2.336664915084839, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22321896255016327, "step": 3334 }, { "epoch": 0.2085, "grad_norm": 2.625, "grad_norm_var": 0.029752604166666665, "learning_rate": 0.0001, "loss": 7.6849, "loss/crossentropy": 2.4030131101608276, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24722521752119064, "step": 3336 }, { "epoch": 0.208625, "grad_norm": 2.40625, "grad_norm_var": 0.028348795572916665, "learning_rate": 0.0001, "loss": 7.6468, "loss/crossentropy": 2.451479196548462, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2548409700393677, "step": 3338 }, { "epoch": 0.20875, "grad_norm": 2.53125, "grad_norm_var": 0.024079386393229166, "learning_rate": 0.0001, "loss": 7.7279, "loss/crossentropy": 2.391486406326294, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24848993867635727, "step": 3340 }, { "epoch": 0.208875, "grad_norm": 2.5625, "grad_norm_var": 0.015412394205729167, "learning_rate": 0.0001, "loss": 7.6761, "loss/crossentropy": 2.242367148399353, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23639392107725143, "step": 3342 }, { "epoch": 0.209, "grad_norm": 2.296875, "grad_norm_var": 0.016487630208333333, "learning_rate": 0.0001, "loss": 7.6526, "loss/crossentropy": 2.131048798561096, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2150106355547905, "step": 3344 }, { "epoch": 0.209125, "grad_norm": 2.359375, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.4834, "loss/crossentropy": 2.2289204597473145, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2406204640865326, "step": 3346 }, { "epoch": 0.20925, "grad_norm": 3.078125, "grad_norm_var": 0.04006245930989583, "learning_rate": 0.0001, "loss": 7.6395, "loss/crossentropy": 2.335645318031311, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2360967919230461, "step": 3348 }, { "epoch": 0.209375, "grad_norm": 2.3125, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 7.6344, "loss/crossentropy": 2.3322980403900146, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2533845752477646, "step": 3350 }, { "epoch": 0.2095, "grad_norm": 2.328125, "grad_norm_var": 0.06763916015625, "learning_rate": 0.0001, "loss": 7.394, "loss/crossentropy": 2.0071592926979065, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21447932720184326, "step": 3352 }, { "epoch": 0.209625, "grad_norm": 2.671875, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 7.5678, "loss/crossentropy": 2.3756041526794434, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23200811445713043, "step": 3354 }, { "epoch": 0.20975, "grad_norm": 2.34375, "grad_norm_var": 0.08035380045572917, "learning_rate": 0.0001, "loss": 7.4439, "loss/crossentropy": 1.9585599303245544, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.19989113509655, "step": 3356 }, { "epoch": 0.209875, "grad_norm": 2.640625, "grad_norm_var": 0.08413798014322917, "learning_rate": 0.0001, "loss": 7.3117, "loss/crossentropy": 2.1502009630203247, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22656698524951935, "step": 3358 }, { "epoch": 0.21, "grad_norm": 2.375, "grad_norm_var": 0.077978515625, "learning_rate": 0.0001, "loss": 7.6045, "loss/crossentropy": 2.3314971923828125, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2628200501203537, "step": 3360 }, { "epoch": 0.210125, "grad_norm": 2.375, "grad_norm_var": 0.07669169108072917, "learning_rate": 0.0001, "loss": 7.6801, "loss/crossentropy": 2.407312273979187, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2756526470184326, "step": 3362 }, { "epoch": 0.21025, "grad_norm": 2.578125, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 7.7946, "loss/crossentropy": 2.308638334274292, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22128069400787354, "step": 3364 }, { "epoch": 0.210375, "grad_norm": 2.625, "grad_norm_var": 0.048192342122395836, "learning_rate": 0.0001, "loss": 7.4601, "loss/crossentropy": 2.227054715156555, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.223979651927948, "step": 3366 }, { "epoch": 0.2105, "grad_norm": 2.25, "grad_norm_var": 0.04175516764322917, "learning_rate": 0.0001, "loss": 7.3921, "loss/crossentropy": 2.1322200298309326, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25718845427036285, "step": 3368 }, { "epoch": 0.210625, "grad_norm": 2.640625, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 7.5104, "loss/crossentropy": 2.1420618891716003, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2320111319422722, "step": 3370 }, { "epoch": 0.21075, "grad_norm": 2.3125, "grad_norm_var": 0.03416239420572917, "learning_rate": 0.0001, "loss": 7.7232, "loss/crossentropy": 2.3579647541046143, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2402171939611435, "step": 3372 }, { "epoch": 0.210875, "grad_norm": 3.34375, "grad_norm_var": 0.07431233723958333, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.3642072677612305, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22273917496204376, "step": 3374 }, { "epoch": 0.211, "grad_norm": 2.328125, "grad_norm_var": 0.07669270833333333, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 2.2815465927124023, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23410624265670776, "step": 3376 }, { "epoch": 0.211125, "grad_norm": 2.546875, "grad_norm_var": 0.07057291666666667, "learning_rate": 0.0001, "loss": 7.5148, "loss/crossentropy": 2.066421687602997, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24254445731639862, "step": 3378 }, { "epoch": 0.21125, "grad_norm": 2.453125, "grad_norm_var": 0.06165364583333333, "learning_rate": 0.0001, "loss": 7.5341, "loss/crossentropy": 2.347060441970825, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.25217752158641815, "step": 3380 }, { "epoch": 0.211375, "grad_norm": 2.640625, "grad_norm_var": 0.06360270182291666, "learning_rate": 0.0001, "loss": 7.7507, "loss/crossentropy": 2.1539812088012695, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26475973427295685, "step": 3382 }, { "epoch": 0.2115, "grad_norm": 2.609375, "grad_norm_var": 0.0606353759765625, "learning_rate": 0.0001, "loss": 7.7259, "loss/crossentropy": 2.1600695848464966, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2167871668934822, "step": 3384 }, { "epoch": 0.211625, "grad_norm": 2.828125, "grad_norm_var": 0.06337788899739584, "learning_rate": 0.0001, "loss": 7.6085, "loss/crossentropy": 2.286532163619995, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2332899570465088, "step": 3386 }, { "epoch": 0.21175, "grad_norm": 2.515625, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 7.562, "loss/crossentropy": 2.1641604900360107, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2164478898048401, "step": 3388 }, { "epoch": 0.211875, "grad_norm": 2.84375, "grad_norm_var": 0.0242095947265625, "learning_rate": 0.0001, "loss": 7.7682, "loss/crossentropy": 2.3409924507141113, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23107275366783142, "step": 3390 }, { "epoch": 0.212, "grad_norm": 2.65625, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.5186, "loss/crossentropy": 2.2500524520874023, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.213922381401062, "step": 3392 }, { "epoch": 0.212125, "grad_norm": 2.328125, "grad_norm_var": 0.038426717122395836, "learning_rate": 0.0001, "loss": 7.5321, "loss/crossentropy": 2.1275582909584045, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22479213774204254, "step": 3394 }, { "epoch": 0.21225, "grad_norm": 2.578125, "grad_norm_var": 0.0404449462890625, "learning_rate": 0.0001, "loss": 7.5174, "loss/crossentropy": 2.212312698364258, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24751190841197968, "step": 3396 }, { "epoch": 0.212375, "grad_norm": 2.328125, "grad_norm_var": 0.042236328125, "learning_rate": 0.0001, "loss": 7.4043, "loss/crossentropy": 1.9285815954208374, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.19548063725233078, "step": 3398 }, { "epoch": 0.2125, "grad_norm": 2.5625, "grad_norm_var": 0.0437408447265625, "learning_rate": 0.0001, "loss": 7.6525, "loss/crossentropy": 2.2272496223449707, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24632243812084198, "step": 3400 }, { "epoch": 0.212625, "grad_norm": 2.46875, "grad_norm_var": 0.03942057291666667, "learning_rate": 0.0001, "loss": 7.6239, "loss/crossentropy": 2.521925210952759, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23339618742465973, "step": 3402 }, { "epoch": 0.21275, "grad_norm": 2.296875, "grad_norm_var": 0.04421284993489583, "learning_rate": 0.0001, "loss": 7.5628, "loss/crossentropy": 2.089443802833557, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2286679819226265, "step": 3404 }, { "epoch": 0.212875, "grad_norm": 2.59375, "grad_norm_var": 0.03938700358072917, "learning_rate": 0.0001, "loss": 7.6109, "loss/crossentropy": 2.3914257287979126, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23510746657848358, "step": 3406 }, { "epoch": 0.213, "grad_norm": 2.171875, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 7.456, "loss/crossentropy": 2.1810909509658813, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21339446306228638, "step": 3408 }, { "epoch": 0.213125, "grad_norm": 2.640625, "grad_norm_var": 0.031126912434895834, "learning_rate": 0.0001, "loss": 7.6262, "loss/crossentropy": 2.068147301673889, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23478543758392334, "step": 3410 }, { "epoch": 0.21325, "grad_norm": 2.4375, "grad_norm_var": 0.022786458333333332, "learning_rate": 0.0001, "loss": 7.6825, "loss/crossentropy": 2.299628734588623, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24276187270879745, "step": 3412 }, { "epoch": 0.213375, "grad_norm": 2.53125, "grad_norm_var": 0.0220703125, "learning_rate": 0.0001, "loss": 7.5285, "loss/crossentropy": 2.539917826652527, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24582375586032867, "step": 3414 }, { "epoch": 0.2135, "grad_norm": 2.578125, "grad_norm_var": 0.016039021809895835, "learning_rate": 0.0001, "loss": 7.7053, "loss/crossentropy": 2.2273647785186768, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24760635942220688, "step": 3416 }, { "epoch": 0.213625, "grad_norm": 2.421875, "grad_norm_var": 0.023143513997395834, "learning_rate": 0.0001, "loss": 7.5504, "loss/crossentropy": 2.0793908834457397, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20350514352321625, "step": 3418 }, { "epoch": 0.21375, "grad_norm": 2.6875, "grad_norm_var": 0.030887858072916666, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.2238458395004272, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23141219466924667, "step": 3420 }, { "epoch": 0.213875, "grad_norm": 2.78125, "grad_norm_var": 0.03818359375, "learning_rate": 0.0001, "loss": 7.6894, "loss/crossentropy": 2.2496371269226074, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24826089292764664, "step": 3422 }, { "epoch": 0.214, "grad_norm": 2.453125, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 7.5904, "loss/crossentropy": 2.2201942205429077, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22636444121599197, "step": 3424 }, { "epoch": 0.214125, "grad_norm": 2.5625, "grad_norm_var": 0.029117838541666666, "learning_rate": 0.0001, "loss": 7.7162, "loss/crossentropy": 2.387493371963501, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23339012265205383, "step": 3426 }, { "epoch": 0.21425, "grad_norm": 2.625, "grad_norm_var": 0.038263956705729164, "learning_rate": 0.0001, "loss": 7.4986, "loss/crossentropy": 2.1064809560775757, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.2052219733595848, "step": 3428 }, { "epoch": 0.214375, "grad_norm": 2.5, "grad_norm_var": 0.043603515625, "learning_rate": 0.0001, "loss": 7.7774, "loss/crossentropy": 2.430534839630127, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2444680631160736, "step": 3430 }, { "epoch": 0.2145, "grad_norm": 2.4375, "grad_norm_var": 0.0421539306640625, "learning_rate": 0.0001, "loss": 7.6541, "loss/crossentropy": 2.2829513549804688, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23072397708892822, "step": 3432 }, { "epoch": 0.214625, "grad_norm": 2.4375, "grad_norm_var": 0.047215779622395836, "learning_rate": 0.0001, "loss": 7.4546, "loss/crossentropy": 2.240882158279419, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24445254355669022, "step": 3434 }, { "epoch": 0.21475, "grad_norm": 2.8125, "grad_norm_var": 0.04260660807291667, "learning_rate": 0.0001, "loss": 7.585, "loss/crossentropy": 1.915956974029541, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20426518470048904, "step": 3436 }, { "epoch": 0.214875, "grad_norm": 2.484375, "grad_norm_var": 0.038141886393229164, "learning_rate": 0.0001, "loss": 7.6817, "loss/crossentropy": 2.0629988312721252, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23025204241275787, "step": 3438 }, { "epoch": 0.215, "grad_norm": 2.390625, "grad_norm_var": 0.0386627197265625, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 2.1499475240707397, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2223310023546219, "step": 3440 }, { "epoch": 0.215125, "grad_norm": 2.640625, "grad_norm_var": 0.0455474853515625, "learning_rate": 0.0001, "loss": 7.6482, "loss/crossentropy": 2.323388457298279, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23185274004936218, "step": 3442 }, { "epoch": 0.21525, "grad_norm": 2.359375, "grad_norm_var": 0.0388580322265625, "learning_rate": 0.0001, "loss": 7.6131, "loss/crossentropy": 2.4512449502944946, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2369954064488411, "step": 3444 }, { "epoch": 0.215375, "grad_norm": 2.4375, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 7.637, "loss/crossentropy": 2.382017970085144, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23922354727983475, "step": 3446 }, { "epoch": 0.2155, "grad_norm": 2.390625, "grad_norm_var": 0.03400065104166667, "learning_rate": 0.0001, "loss": 7.5746, "loss/crossentropy": 2.27813720703125, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2455897182226181, "step": 3448 }, { "epoch": 0.215625, "grad_norm": 2.5, "grad_norm_var": 0.024247233072916666, "learning_rate": 0.0001, "loss": 7.6352, "loss/crossentropy": 2.0362807512283325, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.24546430259943008, "step": 3450 }, { "epoch": 0.21575, "grad_norm": 2.390625, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 7.5596, "loss/crossentropy": 2.3591285943984985, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2355157434940338, "step": 3452 }, { "epoch": 0.215875, "grad_norm": 2.5625, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 2.0467506051063538, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2161174640059471, "step": 3454 }, { "epoch": 0.216, "grad_norm": 2.234375, "grad_norm_var": 0.020247395833333334, "learning_rate": 0.0001, "loss": 7.744, "loss/crossentropy": 2.383505702018738, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23139237612485886, "step": 3456 }, { "epoch": 0.216125, "grad_norm": 2.46875, "grad_norm_var": 0.007906087239583333, "learning_rate": 0.0001, "loss": 7.6497, "loss/crossentropy": 2.422740340232849, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2240883857011795, "step": 3458 }, { "epoch": 0.21625, "grad_norm": 2.4375, "grad_norm_var": 0.0074615478515625, "learning_rate": 0.0001, "loss": 7.6628, "loss/crossentropy": 2.494243025779724, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2455742135643959, "step": 3460 }, { "epoch": 0.216375, "grad_norm": 2.546875, "grad_norm_var": 0.0082672119140625, "learning_rate": 0.0001, "loss": 7.5369, "loss/crossentropy": 2.1609995365142822, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2432066798210144, "step": 3462 }, { "epoch": 0.2165, "grad_norm": 2.359375, "grad_norm_var": 0.0080230712890625, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.152750015258789, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22096887230873108, "step": 3464 }, { "epoch": 0.216625, "grad_norm": 2.5625, "grad_norm_var": 0.008275349934895834, "learning_rate": 0.0001, "loss": 7.6158, "loss/crossentropy": 2.2551519870758057, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22652066498994827, "step": 3466 }, { "epoch": 0.21675, "grad_norm": 2.390625, "grad_norm_var": 0.0111236572265625, "learning_rate": 0.0001, "loss": 7.6574, "loss/crossentropy": 2.2164549827575684, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23051265627145767, "step": 3468 }, { "epoch": 0.216875, "grad_norm": 2.328125, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 7.5793, "loss/crossentropy": 2.5120718479156494, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24404268711805344, "step": 3470 }, { "epoch": 0.217, "grad_norm": 2.390625, "grad_norm_var": 0.009358723958333334, "learning_rate": 0.0001, "loss": 7.4738, "loss/crossentropy": 2.171375274658203, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23342353105545044, "step": 3472 }, { "epoch": 0.217125, "grad_norm": 2.34375, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 7.6139, "loss/crossentropy": 2.095268964767456, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23750004172325134, "step": 3474 }, { "epoch": 0.21725, "grad_norm": 2.40625, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 7.6385, "loss/crossentropy": 2.187831997871399, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21564120054244995, "step": 3476 }, { "epoch": 0.217375, "grad_norm": 2.375, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 7.649, "loss/crossentropy": 2.339821934700012, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24484576284885406, "step": 3478 }, { "epoch": 0.2175, "grad_norm": 2.46875, "grad_norm_var": 0.010188802083333334, "learning_rate": 0.0001, "loss": 7.7182, "loss/crossentropy": 2.085095524787903, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21802203357219696, "step": 3480 }, { "epoch": 0.217625, "grad_norm": 2.453125, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 7.6739, "loss/crossentropy": 2.4005582332611084, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22154852002859116, "step": 3482 }, { "epoch": 0.21775, "grad_norm": 4.375, "grad_norm_var": 0.24265034993489584, "learning_rate": 0.0001, "loss": 7.5902, "loss/crossentropy": 2.235607147216797, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.3020609989762306, "step": 3484 }, { "epoch": 0.217875, "grad_norm": 2.703125, "grad_norm_var": 0.2362457275390625, "learning_rate": 0.0001, "loss": 7.5921, "loss/crossentropy": 2.37876033782959, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.238236665725708, "step": 3486 }, { "epoch": 0.218, "grad_norm": 2.828125, "grad_norm_var": 0.23375244140625, "learning_rate": 0.0001, "loss": 7.7093, "loss/crossentropy": 2.446916341781616, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2403736189007759, "step": 3488 }, { "epoch": 0.218125, "grad_norm": 2.28125, "grad_norm_var": 0.23952534993489583, "learning_rate": 0.0001, "loss": 7.6001, "loss/crossentropy": 2.703190565109253, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24328448623418808, "step": 3490 }, { "epoch": 0.21825, "grad_norm": 2.515625, "grad_norm_var": 0.23860677083333334, "learning_rate": 0.0001, "loss": 7.6135, "loss/crossentropy": 2.266252636909485, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2304093837738037, "step": 3492 }, { "epoch": 0.218375, "grad_norm": 3.0625, "grad_norm_var": 0.24362691243489584, "learning_rate": 0.0001, "loss": 7.6869, "loss/crossentropy": 2.2512608766555786, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23177290707826614, "step": 3494 }, { "epoch": 0.2185, "grad_norm": 2.390625, "grad_norm_var": 0.24628804524739584, "learning_rate": 0.0001, "loss": 7.8453, "loss/crossentropy": 2.3944746255874634, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22361087799072266, "step": 3496 }, { "epoch": 0.218625, "grad_norm": 2.34375, "grad_norm_var": 0.25588785807291664, "learning_rate": 0.0001, "loss": 7.3925, "loss/crossentropy": 1.9911410212516785, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2150706946849823, "step": 3498 }, { "epoch": 0.21875, "grad_norm": 2.84375, "grad_norm_var": 0.05894266764322917, "learning_rate": 0.0001, "loss": 7.6356, "loss/crossentropy": 2.290215253829956, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22882460057735443, "step": 3500 }, { "epoch": 0.218875, "grad_norm": 2.15625, "grad_norm_var": 0.07018941243489583, "learning_rate": 0.0001, "loss": 7.5163, "loss/crossentropy": 2.201116681098938, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.20892898738384247, "step": 3502 }, { "epoch": 0.219, "grad_norm": 2.5, "grad_norm_var": 0.06816304524739583, "learning_rate": 0.0001, "loss": 7.7937, "loss/crossentropy": 2.210504412651062, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21811866015195847, "step": 3504 }, { "epoch": 0.219125, "grad_norm": 2.5625, "grad_norm_var": 0.06256510416666666, "learning_rate": 0.0001, "loss": 7.6281, "loss/crossentropy": 2.355741262435913, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23753728717565536, "step": 3506 }, { "epoch": 0.21925, "grad_norm": 2.296875, "grad_norm_var": 0.06705729166666667, "learning_rate": 0.0001, "loss": 7.5304, "loss/crossentropy": 2.2776095867156982, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22738799452781677, "step": 3508 }, { "epoch": 0.219375, "grad_norm": 2.40625, "grad_norm_var": 0.046605428059895836, "learning_rate": 0.0001, "loss": 7.5032, "loss/crossentropy": 2.295590400695801, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2449091300368309, "step": 3510 }, { "epoch": 0.2195, "grad_norm": 2.671875, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 7.6288, "loss/crossentropy": 2.326041340827942, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2660403698682785, "step": 3512 }, { "epoch": 0.219625, "grad_norm": 2.28125, "grad_norm_var": 0.03609619140625, "learning_rate": 0.0001, "loss": 7.5858, "loss/crossentropy": 2.432402729988098, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25160669535398483, "step": 3514 }, { "epoch": 0.21975, "grad_norm": 2.890625, "grad_norm_var": 0.038895670572916666, "learning_rate": 0.0001, "loss": 7.818, "loss/crossentropy": 2.349487543106079, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23811831325292587, "step": 3516 }, { "epoch": 0.219875, "grad_norm": 2.546875, "grad_norm_var": 0.0508941650390625, "learning_rate": 0.0001, "loss": 7.7315, "loss/crossentropy": 2.3854116201400757, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24041782319545746, "step": 3518 }, { "epoch": 0.22, "grad_norm": 2.265625, "grad_norm_var": 0.048075358072916664, "learning_rate": 0.0001, "loss": 7.5721, "loss/crossentropy": 2.253451347351074, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2183404043316841, "step": 3520 }, { "epoch": 0.220125, "grad_norm": 2.453125, "grad_norm_var": 0.04853108723958333, "learning_rate": 0.0001, "loss": 7.4311, "loss/crossentropy": 1.989369809627533, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.218755841255188, "step": 3522 }, { "epoch": 0.22025, "grad_norm": 2.390625, "grad_norm_var": 0.046187337239583334, "learning_rate": 0.0001, "loss": 7.5011, "loss/crossentropy": 2.3954477310180664, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.254411518573761, "step": 3524 }, { "epoch": 0.220375, "grad_norm": 2.53125, "grad_norm_var": 0.04576416015625, "learning_rate": 0.0001, "loss": 7.5453, "loss/crossentropy": 1.9688183665275574, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2124113142490387, "step": 3526 }, { "epoch": 0.2205, "grad_norm": 2.40625, "grad_norm_var": 0.04296468098958333, "learning_rate": 0.0001, "loss": 7.5413, "loss/crossentropy": 2.304739475250244, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24053708463907242, "step": 3528 }, { "epoch": 0.220625, "grad_norm": 2.40625, "grad_norm_var": 0.040282185872395834, "learning_rate": 0.0001, "loss": 7.5082, "loss/crossentropy": 2.188236117362976, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2476644068956375, "step": 3530 }, { "epoch": 0.22075, "grad_norm": 2.28125, "grad_norm_var": 0.03193257649739583, "learning_rate": 0.0001, "loss": 7.4935, "loss/crossentropy": 2.452435612678528, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2143031656742096, "step": 3532 }, { "epoch": 0.220875, "grad_norm": 2.21875, "grad_norm_var": 0.012613932291666666, "learning_rate": 0.0001, "loss": 7.4794, "loss/crossentropy": 2.0797160863876343, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2122730240225792, "step": 3534 }, { "epoch": 0.221, "grad_norm": 2.484375, "grad_norm_var": 0.010595703125, "learning_rate": 0.0001, "loss": 7.4632, "loss/crossentropy": 2.4266319274902344, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2510553449392319, "step": 3536 }, { "epoch": 0.221125, "grad_norm": 2.203125, "grad_norm_var": 0.012450154622395833, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.059949517250061, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23185402899980545, "step": 3538 }, { "epoch": 0.22125, "grad_norm": 2.53125, "grad_norm_var": 0.01265869140625, "learning_rate": 0.0001, "loss": 7.6176, "loss/crossentropy": 2.5653083324432373, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23594635725021362, "step": 3540 }, { "epoch": 0.221375, "grad_norm": 2.265625, "grad_norm_var": 0.013667805989583334, "learning_rate": 0.0001, "loss": 7.481, "loss/crossentropy": 2.3301087617874146, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2452377825975418, "step": 3542 }, { "epoch": 0.2215, "grad_norm": 2.34375, "grad_norm_var": 0.0131011962890625, "learning_rate": 0.0001, "loss": 7.5606, "loss/crossentropy": 2.2409706115722656, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2660795971751213, "step": 3544 }, { "epoch": 0.221625, "grad_norm": 2.65625, "grad_norm_var": 0.020589192708333332, "learning_rate": 0.0001, "loss": 7.6275, "loss/crossentropy": 2.341962456703186, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23292769491672516, "step": 3546 }, { "epoch": 0.22175, "grad_norm": 2.546875, "grad_norm_var": 0.020503743489583334, "learning_rate": 0.0001, "loss": 7.5761, "loss/crossentropy": 2.27796733379364, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22092991322278976, "step": 3548 }, { "epoch": 0.221875, "grad_norm": 2.375, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.5742, "loss/crossentropy": 2.189824938774109, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23947207629680634, "step": 3550 }, { "epoch": 0.222, "grad_norm": 2.453125, "grad_norm_var": 0.017186482747395832, "learning_rate": 0.0001, "loss": 7.5813, "loss/crossentropy": 2.4192023277282715, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.24019119888544083, "step": 3552 }, { "epoch": 0.222125, "grad_norm": 2.375, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.158234715461731, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2341742068529129, "step": 3554 }, { "epoch": 0.22225, "grad_norm": 3.203125, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 7.378, "loss/crossentropy": 2.2715145349502563, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22904645651578903, "step": 3556 }, { "epoch": 0.222375, "grad_norm": 2.375, "grad_norm_var": 0.26266276041666664, "learning_rate": 0.0001, "loss": 7.6181, "loss/crossentropy": 2.2331241369247437, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25211699306964874, "step": 3558 }, { "epoch": 0.2225, "grad_norm": 2.375, "grad_norm_var": 0.2688140869140625, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 1.9353508949279785, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2270013615489006, "step": 3560 }, { "epoch": 0.222625, "grad_norm": 2.40625, "grad_norm_var": 0.27647196451822914, "learning_rate": 0.0001, "loss": 7.5403, "loss/crossentropy": 2.4099196195602417, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24662812799215317, "step": 3562 }, { "epoch": 0.22275, "grad_norm": 2.71875, "grad_norm_var": 0.27763264973958335, "learning_rate": 0.0001, "loss": 7.7591, "loss/crossentropy": 2.163403630256653, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22525054216384888, "step": 3564 }, { "epoch": 0.222875, "grad_norm": 2.484375, "grad_norm_var": 0.27356363932291666, "learning_rate": 0.0001, "loss": 7.6724, "loss/crossentropy": 2.3632307052612305, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.221424400806427, "step": 3566 }, { "epoch": 0.223, "grad_norm": 2.359375, "grad_norm_var": 0.2787394205729167, "learning_rate": 0.0001, "loss": 7.4518, "loss/crossentropy": 2.079641282558441, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22443066537380219, "step": 3568 }, { "epoch": 0.223125, "grad_norm": 2.578125, "grad_norm_var": 0.28662109375, "learning_rate": 0.0001, "loss": 7.5553, "loss/crossentropy": 2.0237990021705627, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21297388523817062, "step": 3570 }, { "epoch": 0.22325, "grad_norm": 2.625, "grad_norm_var": 0.26023763020833335, "learning_rate": 0.0001, "loss": 7.9066, "loss/crossentropy": 2.4305167198181152, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24352750182151794, "step": 3572 }, { "epoch": 0.223375, "grad_norm": 2.453125, "grad_norm_var": 0.03300679524739583, "learning_rate": 0.0001, "loss": 7.8042, "loss/crossentropy": 2.296820282936096, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23312810063362122, "step": 3574 }, { "epoch": 0.2235, "grad_norm": 2.65625, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 7.7462, "loss/crossentropy": 2.3262380361557007, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22458947449922562, "step": 3576 }, { "epoch": 0.223625, "grad_norm": 2.6875, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 7.6477, "loss/crossentropy": 2.4565069675445557, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2450973466038704, "step": 3578 }, { "epoch": 0.22375, "grad_norm": 2.765625, "grad_norm_var": 0.028913370768229165, "learning_rate": 0.0001, "loss": 7.5068, "loss/crossentropy": 1.9984254240989685, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.19529377669095993, "step": 3580 }, { "epoch": 0.223875, "grad_norm": 2.5, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 7.5913, "loss/crossentropy": 2.2899560928344727, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25301460921764374, "step": 3582 }, { "epoch": 0.224, "grad_norm": 2.421875, "grad_norm_var": 0.026764933268229166, "learning_rate": 0.0001, "loss": 7.5833, "loss/crossentropy": 2.1312328577041626, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22322946041822433, "step": 3584 }, { "epoch": 0.224125, "grad_norm": 2.421875, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 7.6728, "loss/crossentropy": 2.5150551795959473, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2675798535346985, "step": 3586 }, { "epoch": 0.22425, "grad_norm": 2.5, "grad_norm_var": 0.0173492431640625, "learning_rate": 0.0001, "loss": 7.5826, "loss/crossentropy": 2.1855462789535522, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21716494113206863, "step": 3588 }, { "epoch": 0.224375, "grad_norm": 2.328125, "grad_norm_var": 0.0208892822265625, "learning_rate": 0.0001, "loss": 7.4135, "loss/crossentropy": 2.4238641262054443, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23732400685548782, "step": 3590 }, { "epoch": 0.2245, "grad_norm": 2.46875, "grad_norm_var": 0.020002237955729165, "learning_rate": 0.0001, "loss": 7.5734, "loss/crossentropy": 2.313872456550598, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2245703637599945, "step": 3592 }, { "epoch": 0.224625, "grad_norm": 2.421875, "grad_norm_var": 0.017822265625, "learning_rate": 0.0001, "loss": 7.6389, "loss/crossentropy": 2.149410605430603, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22927816212177277, "step": 3594 }, { "epoch": 0.22475, "grad_norm": 2.671875, "grad_norm_var": 0.013117472330729166, "learning_rate": 0.0001, "loss": 7.594, "loss/crossentropy": 2.228400468826294, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23162036389112473, "step": 3596 }, { "epoch": 0.224875, "grad_norm": 2.3125, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 7.6038, "loss/crossentropy": 2.181916832923889, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21812283247709274, "step": 3598 }, { "epoch": 0.225, "grad_norm": 2.484375, "grad_norm_var": 0.0110015869140625, "learning_rate": 0.0001, "loss": 7.7592, "loss/crossentropy": 2.3809478282928467, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2515050619840622, "step": 3600 }, { "epoch": 0.225125, "grad_norm": 2.34375, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.1800352334976196, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20716014504432678, "step": 3602 }, { "epoch": 0.22525, "grad_norm": 2.328125, "grad_norm_var": 0.01744384765625, "learning_rate": 0.0001, "loss": 7.5785, "loss/crossentropy": 2.423554301261902, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24492305517196655, "step": 3604 }, { "epoch": 0.225375, "grad_norm": 2.78125, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 7.7038, "loss/crossentropy": 2.3537445068359375, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2731524705886841, "step": 3606 }, { "epoch": 0.2255, "grad_norm": 2.609375, "grad_norm_var": 0.04365234375, "learning_rate": 0.0001, "loss": 7.7234, "loss/crossentropy": 2.3835121393203735, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22908172756433487, "step": 3608 }, { "epoch": 0.225625, "grad_norm": 2.296875, "grad_norm_var": 0.04636942545572917, "learning_rate": 0.0001, "loss": 7.6703, "loss/crossentropy": 2.6490813493728638, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22531013935804367, "step": 3610 }, { "epoch": 0.22575, "grad_norm": 2.34375, "grad_norm_var": 0.05110575358072917, "learning_rate": 0.0001, "loss": 7.5759, "loss/crossentropy": 2.1871402263641357, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22304056584835052, "step": 3612 }, { "epoch": 0.225875, "grad_norm": 2.53125, "grad_norm_var": 0.0443511962890625, "learning_rate": 0.0001, "loss": 7.4932, "loss/crossentropy": 2.2581640481948853, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24387839436531067, "step": 3614 }, { "epoch": 0.226, "grad_norm": 2.5625, "grad_norm_var": 0.044489542643229164, "learning_rate": 0.0001, "loss": 7.7008, "loss/crossentropy": 2.2254581451416016, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22644620388746262, "step": 3616 }, { "epoch": 0.226125, "grad_norm": 2.359375, "grad_norm_var": 0.04413960774739583, "learning_rate": 0.0001, "loss": 7.5712, "loss/crossentropy": 2.0950043201446533, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24633889645338058, "step": 3618 }, { "epoch": 0.22625, "grad_norm": 2.296875, "grad_norm_var": 0.042292277018229164, "learning_rate": 0.0001, "loss": 7.6548, "loss/crossentropy": 2.2507599592208862, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22745974361896515, "step": 3620 }, { "epoch": 0.226375, "grad_norm": 2.609375, "grad_norm_var": 0.032613118489583336, "learning_rate": 0.0001, "loss": 7.6329, "loss/crossentropy": 2.114292323589325, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22122054547071457, "step": 3622 }, { "epoch": 0.2265, "grad_norm": 2.296875, "grad_norm_var": 0.011823527018229167, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.210999310016632, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22689851373434067, "step": 3624 }, { "epoch": 0.226625, "grad_norm": 2.34375, "grad_norm_var": 0.014583333333333334, "learning_rate": 0.0001, "loss": 7.4216, "loss/crossentropy": 2.5903425216674805, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22328981757164001, "step": 3626 }, { "epoch": 0.22675, "grad_norm": 2.453125, "grad_norm_var": 0.01275634765625, "learning_rate": 0.0001, "loss": 7.5963, "loss/crossentropy": 2.3979709148406982, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2354452759027481, "step": 3628 }, { "epoch": 0.226875, "grad_norm": 2.34375, "grad_norm_var": 0.01240234375, "learning_rate": 0.0001, "loss": 7.4937, "loss/crossentropy": 2.308434844017029, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22410962730646133, "step": 3630 }, { "epoch": 0.227, "grad_norm": 2.140625, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 7.5951, "loss/crossentropy": 2.4131675958633423, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24779768288135529, "step": 3632 }, { "epoch": 0.227125, "grad_norm": 2.796875, "grad_norm_var": 0.024723307291666666, "learning_rate": 0.0001, "loss": 7.5681, "loss/crossentropy": 2.2655831575393677, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2255219966173172, "step": 3634 }, { "epoch": 0.22725, "grad_norm": 2.25, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 7.4176, "loss/crossentropy": 2.3867735862731934, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24707113206386566, "step": 3636 }, { "epoch": 0.227375, "grad_norm": 2.46875, "grad_norm_var": 0.022834269205729167, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.3354828357696533, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2387101650238037, "step": 3638 }, { "epoch": 0.2275, "grad_norm": 2.4375, "grad_norm_var": 0.022581990559895834, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 2.164547324180603, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21518608927726746, "step": 3640 }, { "epoch": 0.227625, "grad_norm": 2.5, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 7.6245, "loss/crossentropy": 2.3133569955825806, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2465798780322075, "step": 3642 }, { "epoch": 0.22775, "grad_norm": 2.359375, "grad_norm_var": 0.02066650390625, "learning_rate": 0.0001, "loss": 7.5408, "loss/crossentropy": 2.3382097482681274, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22838010638952255, "step": 3644 }, { "epoch": 0.227875, "grad_norm": 2.484375, "grad_norm_var": 0.0206695556640625, "learning_rate": 0.0001, "loss": 7.4479, "loss/crossentropy": 2.2210439443588257, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21334365755319595, "step": 3646 }, { "epoch": 0.228, "grad_norm": 2.3125, "grad_norm_var": 0.019310506184895833, "learning_rate": 0.0001, "loss": 7.5992, "loss/crossentropy": 2.1229522228240967, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23556266725063324, "step": 3648 }, { "epoch": 0.228125, "grad_norm": 2.421875, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 7.5133, "loss/crossentropy": 2.335044503211975, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25041337311267853, "step": 3650 }, { "epoch": 0.22825, "grad_norm": 2.4375, "grad_norm_var": 0.01851806640625, "learning_rate": 0.0001, "loss": 7.5085, "loss/crossentropy": 2.1980998516082764, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21712280064821243, "step": 3652 }, { "epoch": 0.228375, "grad_norm": 2.53125, "grad_norm_var": 0.021100870768229165, "learning_rate": 0.0001, "loss": 7.7472, "loss/crossentropy": 2.513595938682556, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2537487596273422, "step": 3654 }, { "epoch": 0.2285, "grad_norm": 2.484375, "grad_norm_var": 0.019722493489583333, "learning_rate": 0.0001, "loss": 7.515, "loss/crossentropy": 2.3871147632598877, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24276654422283173, "step": 3656 }, { "epoch": 0.228625, "grad_norm": 2.390625, "grad_norm_var": 0.021703084309895832, "learning_rate": 0.0001, "loss": 7.5239, "loss/crossentropy": 2.449997901916504, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23972835391759872, "step": 3658 }, { "epoch": 0.22875, "grad_norm": 2.1875, "grad_norm_var": 0.023957316080729166, "learning_rate": 0.0001, "loss": 7.3961, "loss/crossentropy": 1.9787690043449402, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21979594230651855, "step": 3660 }, { "epoch": 0.228875, "grad_norm": 2.234375, "grad_norm_var": 0.030777994791666666, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.327951431274414, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2144240364432335, "step": 3662 }, { "epoch": 0.229, "grad_norm": 2.28125, "grad_norm_var": 0.028954060872395833, "learning_rate": 0.0001, "loss": 7.4867, "loss/crossentropy": 2.2228533029556274, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2117415815591812, "step": 3664 }, { "epoch": 0.229125, "grad_norm": 2.40625, "grad_norm_var": 0.030817667643229168, "learning_rate": 0.0001, "loss": 7.5446, "loss/crossentropy": 2.5074501037597656, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2416432872414589, "step": 3666 }, { "epoch": 0.22925, "grad_norm": 2.671875, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 7.5777, "loss/crossentropy": 2.1035314798355103, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21771013736724854, "step": 3668 }, { "epoch": 0.229375, "grad_norm": 2.484375, "grad_norm_var": 0.026463826497395832, "learning_rate": 0.0001, "loss": 7.5223, "loss/crossentropy": 2.156371831893921, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2226782739162445, "step": 3670 }, { "epoch": 0.2295, "grad_norm": 2.34375, "grad_norm_var": 0.0279296875, "learning_rate": 0.0001, "loss": 7.5712, "loss/crossentropy": 2.3843045234680176, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23753250390291214, "step": 3672 }, { "epoch": 0.229625, "grad_norm": 2.4375, "grad_norm_var": 0.024128214518229166, "learning_rate": 0.0001, "loss": 7.6631, "loss/crossentropy": 2.2664679288864136, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23596899211406708, "step": 3674 }, { "epoch": 0.22975, "grad_norm": 2.5625, "grad_norm_var": 0.024051920572916666, "learning_rate": 0.0001, "loss": 7.3774, "loss/crossentropy": 2.0786343812942505, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21789100021123886, "step": 3676 }, { "epoch": 0.229875, "grad_norm": 2.296875, "grad_norm_var": 0.0205230712890625, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.148313283920288, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22734209895133972, "step": 3678 }, { "epoch": 0.23, "grad_norm": 2.453125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 7.5015, "loss/crossentropy": 2.3508098125457764, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22268230468034744, "step": 3680 }, { "epoch": 0.230125, "grad_norm": 2.296875, "grad_norm_var": 0.018773396809895832, "learning_rate": 0.0001, "loss": 7.5282, "loss/crossentropy": 2.419388175010681, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2422405481338501, "step": 3682 }, { "epoch": 0.23025, "grad_norm": 2.453125, "grad_norm_var": 0.0137603759765625, "learning_rate": 0.0001, "loss": 7.5899, "loss/crossentropy": 2.2298837900161743, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21447760611772537, "step": 3684 }, { "epoch": 0.230375, "grad_norm": 2.546875, "grad_norm_var": 0.013605753580729166, "learning_rate": 0.0001, "loss": 7.59, "loss/crossentropy": 2.4269654750823975, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24916177242994308, "step": 3686 }, { "epoch": 0.2305, "grad_norm": 2.546875, "grad_norm_var": 0.010431925455729166, "learning_rate": 0.0001, "loss": 7.4829, "loss/crossentropy": 1.9996501207351685, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23489703238010406, "step": 3688 }, { "epoch": 0.230625, "grad_norm": 2.34375, "grad_norm_var": 0.014469401041666666, "learning_rate": 0.0001, "loss": 7.7753, "loss/crossentropy": 2.3211807012557983, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.237454354763031, "step": 3690 }, { "epoch": 0.23075, "grad_norm": 2.5, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 7.5813, "loss/crossentropy": 2.1937203407287598, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2389645278453827, "step": 3692 }, { "epoch": 0.230875, "grad_norm": 2.546875, "grad_norm_var": 0.01240234375, "learning_rate": 0.0001, "loss": 7.7269, "loss/crossentropy": 2.413442850112915, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24230879545211792, "step": 3694 }, { "epoch": 0.231, "grad_norm": 2.40625, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 7.5778, "loss/crossentropy": 2.252353072166443, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22742437571287155, "step": 3696 }, { "epoch": 0.231125, "grad_norm": 2.703125, "grad_norm_var": 0.020670572916666668, "learning_rate": 0.0001, "loss": 7.6338, "loss/crossentropy": 2.2866071462631226, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22707533836364746, "step": 3698 }, { "epoch": 0.23125, "grad_norm": 2.359375, "grad_norm_var": 0.023470052083333335, "learning_rate": 0.0001, "loss": 7.4712, "loss/crossentropy": 1.939712941646576, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23170647770166397, "step": 3700 }, { "epoch": 0.231375, "grad_norm": 2.515625, "grad_norm_var": 0.022196451822916668, "learning_rate": 0.0001, "loss": 7.5799, "loss/crossentropy": 1.7738837003707886, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24364864826202393, "step": 3702 }, { "epoch": 0.2315, "grad_norm": 2.640625, "grad_norm_var": 0.0230865478515625, "learning_rate": 0.0001, "loss": 7.5375, "loss/crossentropy": 2.1018226742744446, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22683311998844147, "step": 3704 }, { "epoch": 0.231625, "grad_norm": 2.515625, "grad_norm_var": 0.0196197509765625, "learning_rate": 0.0001, "loss": 7.6113, "loss/crossentropy": 2.2435100078582764, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22194090485572815, "step": 3706 }, { "epoch": 0.23175, "grad_norm": 2.390625, "grad_norm_var": 0.019147745768229165, "learning_rate": 0.0001, "loss": 7.5735, "loss/crossentropy": 2.114552319049835, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21233541518449783, "step": 3708 }, { "epoch": 0.231875, "grad_norm": 2.46875, "grad_norm_var": 0.020164998372395833, "learning_rate": 0.0001, "loss": 7.4842, "loss/crossentropy": 2.340905785560608, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23916912078857422, "step": 3710 }, { "epoch": 0.232, "grad_norm": 2.65625, "grad_norm_var": 0.018675740559895834, "learning_rate": 0.0001, "loss": 7.5672, "loss/crossentropy": 2.2121574878692627, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22300127893686295, "step": 3712 }, { "epoch": 0.232125, "grad_norm": 2.28125, "grad_norm_var": 0.015852864583333334, "learning_rate": 0.0001, "loss": 7.5457, "loss/crossentropy": 2.350710391998291, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2501726374030113, "step": 3714 }, { "epoch": 0.23225, "grad_norm": 2.515625, "grad_norm_var": 0.015983072916666667, "learning_rate": 0.0001, "loss": 7.5465, "loss/crossentropy": 2.4245604276657104, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22501108050346375, "step": 3716 }, { "epoch": 0.232375, "grad_norm": 2.3125, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 7.6819, "loss/crossentropy": 2.2433913946151733, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23217583447694778, "step": 3718 }, { "epoch": 0.2325, "grad_norm": 2.21875, "grad_norm_var": 0.017194620768229165, "learning_rate": 0.0001, "loss": 7.462, "loss/crossentropy": 2.203797459602356, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21597397327423096, "step": 3720 }, { "epoch": 0.232625, "grad_norm": 2.375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 7.4907, "loss/crossentropy": 2.274090051651001, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22629251331090927, "step": 3722 }, { "epoch": 0.23275, "grad_norm": 2.515625, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 7.5762, "loss/crossentropy": 2.1787149906158447, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21793190389871597, "step": 3724 }, { "epoch": 0.232875, "grad_norm": 2.359375, "grad_norm_var": 0.018001302083333334, "learning_rate": 0.0001, "loss": 7.5742, "loss/crossentropy": 2.472353219985962, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2537408918142319, "step": 3726 }, { "epoch": 0.233, "grad_norm": 2.546875, "grad_norm_var": 0.012376912434895833, "learning_rate": 0.0001, "loss": 7.7366, "loss/crossentropy": 2.1297940015792847, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24158430099487305, "step": 3728 }, { "epoch": 0.233125, "grad_norm": 2.375, "grad_norm_var": 0.0106109619140625, "learning_rate": 0.0001, "loss": 7.5799, "loss/crossentropy": 2.307152509689331, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2116793394088745, "step": 3730 }, { "epoch": 0.23325, "grad_norm": 2.484375, "grad_norm_var": 0.010465494791666667, "learning_rate": 0.0001, "loss": 7.4555, "loss/crossentropy": 1.9622553586959839, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22563175857067108, "step": 3732 }, { "epoch": 0.233375, "grad_norm": 2.46875, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 7.558, "loss/crossentropy": 2.1677664518356323, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22453109174966812, "step": 3734 }, { "epoch": 0.2335, "grad_norm": 2.578125, "grad_norm_var": 0.09955952962239584, "learning_rate": 0.0001, "loss": 7.7388, "loss/crossentropy": 2.192821502685547, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23431353271007538, "step": 3736 }, { "epoch": 0.233625, "grad_norm": 2.53125, "grad_norm_var": 0.10053609212239584, "learning_rate": 0.0001, "loss": 7.5906, "loss/crossentropy": 2.1779892444610596, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23049022257328033, "step": 3738 }, { "epoch": 0.23375, "grad_norm": 2.28125, "grad_norm_var": 0.10708719889322917, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 2.402343273162842, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2568327337503433, "step": 3740 }, { "epoch": 0.233875, "grad_norm": 2.359375, "grad_norm_var": 0.10690104166666667, "learning_rate": 0.0001, "loss": 7.5119, "loss/crossentropy": 2.1869853734970093, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2425551563501358, "step": 3742 }, { "epoch": 0.234, "grad_norm": 2.375, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 7.5073, "loss/crossentropy": 2.2516770362854004, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2116701528429985, "step": 3744 }, { "epoch": 0.234125, "grad_norm": 2.484375, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 7.5986, "loss/crossentropy": 2.005309283733368, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20876743644475937, "step": 3746 }, { "epoch": 0.23425, "grad_norm": 2.4375, "grad_norm_var": 0.10966695149739583, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.5279784202575684, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23321525007486343, "step": 3748 }, { "epoch": 0.234375, "grad_norm": 2.453125, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 7.5474, "loss/crossentropy": 2.34593665599823, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23283468186855316, "step": 3750 }, { "epoch": 0.2345, "grad_norm": 2.1875, "grad_norm_var": 0.024665323893229167, "learning_rate": 0.0001, "loss": 7.4105, "loss/crossentropy": 2.249088764190674, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24345380067825317, "step": 3752 }, { "epoch": 0.234625, "grad_norm": 2.328125, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 2.2118486166000366, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2435833364725113, "step": 3754 }, { "epoch": 0.23475, "grad_norm": 2.59375, "grad_norm_var": 0.017411295572916666, "learning_rate": 0.0001, "loss": 7.6165, "loss/crossentropy": 2.2884024381637573, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23158583790063858, "step": 3756 }, { "epoch": 0.234875, "grad_norm": 2.609375, "grad_norm_var": 0.0189361572265625, "learning_rate": 0.0001, "loss": 7.7033, "loss/crossentropy": 2.2991076707839966, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23600934445858002, "step": 3758 }, { "epoch": 0.235, "grad_norm": 2.1875, "grad_norm_var": 0.020589192708333332, "learning_rate": 0.0001, "loss": 7.4587, "loss/crossentropy": 2.1402446627616882, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.21911373734474182, "step": 3760 }, { "epoch": 0.235125, "grad_norm": 2.453125, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.234601616859436, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.22324847429990768, "step": 3762 }, { "epoch": 0.23525, "grad_norm": 2.34375, "grad_norm_var": 0.022412109375, "learning_rate": 0.0001, "loss": 7.5913, "loss/crossentropy": 2.2850943207740784, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22513452172279358, "step": 3764 }, { "epoch": 0.235375, "grad_norm": 2.3125, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 7.6234, "loss/crossentropy": 2.122738838195801, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22072789072990417, "step": 3766 }, { "epoch": 0.2355, "grad_norm": 2.421875, "grad_norm_var": 0.0145172119140625, "learning_rate": 0.0001, "loss": 7.5619, "loss/crossentropy": 2.4355037212371826, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.24772146344184875, "step": 3768 }, { "epoch": 0.235625, "grad_norm": 2.453125, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 7.5071, "loss/crossentropy": 1.8801981806755066, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22862768173217773, "step": 3770 }, { "epoch": 0.23575, "grad_norm": 2.640625, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 7.4754, "loss/crossentropy": 2.2766274213790894, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2313675582408905, "step": 3772 }, { "epoch": 0.235875, "grad_norm": 2.265625, "grad_norm_var": 0.018610636393229168, "learning_rate": 0.0001, "loss": 7.5267, "loss/crossentropy": 2.0864307284355164, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2242383509874344, "step": 3774 }, { "epoch": 0.236, "grad_norm": 2.390625, "grad_norm_var": 0.01572265625, "learning_rate": 0.0001, "loss": 7.4776, "loss/crossentropy": 2.2763630151748657, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24077893048524857, "step": 3776 }, { "epoch": 0.236125, "grad_norm": 2.375, "grad_norm_var": 0.014127604166666667, "learning_rate": 0.0001, "loss": 7.5641, "loss/crossentropy": 2.2426193952560425, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2291230633854866, "step": 3778 }, { "epoch": 0.23625, "grad_norm": 2.28125, "grad_norm_var": 0.015558878580729166, "learning_rate": 0.0001, "loss": 7.4933, "loss/crossentropy": 2.081269860267639, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25176841020584106, "step": 3780 }, { "epoch": 0.236375, "grad_norm": 6.09375, "grad_norm_var": 1.8935546875, "learning_rate": 0.0001, "loss": 7.8648, "loss/crossentropy": 2.011550545692444, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25546982884407043, "step": 3782 }, { "epoch": 0.2365, "grad_norm": 2.59375, "grad_norm_var": 1.8919993082682292, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.4982590675354004, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2651009112596512, "step": 3784 }, { "epoch": 0.236625, "grad_norm": 2.71875, "grad_norm_var": 1.8650950113932292, "learning_rate": 0.0001, "loss": 7.6646, "loss/crossentropy": 2.2798246145248413, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22900397330522537, "step": 3786 }, { "epoch": 0.23675, "grad_norm": 2.390625, "grad_norm_var": 1.87945556640625, "learning_rate": 0.0001, "loss": 7.379, "loss/crossentropy": 2.03772509098053, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22353503108024597, "step": 3788 }, { "epoch": 0.236875, "grad_norm": 2.734375, "grad_norm_var": 1.82320556640625, "learning_rate": 0.0001, "loss": 7.4055, "loss/crossentropy": 2.317867159843445, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23167508840560913, "step": 3790 }, { "epoch": 0.237, "grad_norm": 2.1875, "grad_norm_var": 1.84205322265625, "learning_rate": 0.0001, "loss": 7.4846, "loss/crossentropy": 2.010311722755432, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20548687875270844, "step": 3792 }, { "epoch": 0.237125, "grad_norm": 2.359375, "grad_norm_var": 1.8334625244140625, "learning_rate": 0.0001, "loss": 7.5309, "loss/crossentropy": 2.325111150741577, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2562504708766937, "step": 3794 }, { "epoch": 0.23725, "grad_norm": 2.328125, "grad_norm_var": 1.8178131103515625, "learning_rate": 0.0001, "loss": 7.5873, "loss/crossentropy": 2.0214288234710693, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2158362790942192, "step": 3796 }, { "epoch": 0.237375, "grad_norm": 2.390625, "grad_norm_var": 0.1085845947265625, "learning_rate": 0.0001, "loss": 7.6152, "loss/crossentropy": 2.025280773639679, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2420121431350708, "step": 3798 }, { "epoch": 0.2375, "grad_norm": 2.453125, "grad_norm_var": 0.025788370768229166, "learning_rate": 0.0001, "loss": 7.5203, "loss/crossentropy": 2.2576816082000732, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24750325828790665, "step": 3800 }, { "epoch": 0.237625, "grad_norm": 2.515625, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 7.562, "loss/crossentropy": 2.4177106618881226, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2321905866265297, "step": 3802 }, { "epoch": 0.23775, "grad_norm": 2.25, "grad_norm_var": 0.017560831705729165, "learning_rate": 0.0001, "loss": 7.6246, "loss/crossentropy": 2.381041169166565, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22734872996807098, "step": 3804 }, { "epoch": 0.237875, "grad_norm": 2.1875, "grad_norm_var": 0.014777628580729167, "learning_rate": 0.0001, "loss": 7.2978, "loss/crossentropy": 2.169810175895691, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21825361251831055, "step": 3806 }, { "epoch": 0.238, "grad_norm": 2.703125, "grad_norm_var": 0.01949462890625, "learning_rate": 0.0001, "loss": 7.6745, "loss/crossentropy": 2.2910315990448, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2431943118572235, "step": 3808 }, { "epoch": 0.238125, "grad_norm": 2.234375, "grad_norm_var": 0.020799763997395835, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.32381534576416, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21884576976299286, "step": 3810 }, { "epoch": 0.23825, "grad_norm": 2.40625, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 7.626, "loss/crossentropy": 2.3181525468826294, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23518574982881546, "step": 3812 }, { "epoch": 0.238375, "grad_norm": 2.328125, "grad_norm_var": 0.021712239583333334, "learning_rate": 0.0001, "loss": 7.718, "loss/crossentropy": 2.4021564722061157, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22601833939552307, "step": 3814 }, { "epoch": 0.2385, "grad_norm": 2.671875, "grad_norm_var": 0.026253255208333333, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.241186261177063, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22214636206626892, "step": 3816 }, { "epoch": 0.238625, "grad_norm": 2.203125, "grad_norm_var": 0.028076171875, "learning_rate": 0.0001, "loss": 7.4295, "loss/crossentropy": 2.3284627199172974, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23751358687877655, "step": 3818 }, { "epoch": 0.23875, "grad_norm": 2.328125, "grad_norm_var": 0.025634765625, "learning_rate": 0.0001, "loss": 7.3839, "loss/crossentropy": 1.878059983253479, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.20184766501188278, "step": 3820 }, { "epoch": 0.238875, "grad_norm": 2.296875, "grad_norm_var": 0.021415201822916667, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 2.2008787393569946, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2629189044237137, "step": 3822 }, { "epoch": 0.239, "grad_norm": 2.5625, "grad_norm_var": 0.016803995768229166, "learning_rate": 0.0001, "loss": 7.4507, "loss/crossentropy": 2.3513262271881104, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22654324024915695, "step": 3824 }, { "epoch": 0.239125, "grad_norm": 2.640625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 7.4914, "loss/crossentropy": 2.05319607257843, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21194881200790405, "step": 3826 }, { "epoch": 0.23925, "grad_norm": 2.296875, "grad_norm_var": 0.020882161458333333, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.274933695793152, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24475091695785522, "step": 3828 }, { "epoch": 0.239375, "grad_norm": 2.625, "grad_norm_var": 0.02076416015625, "learning_rate": 0.0001, "loss": 7.4758, "loss/crossentropy": 2.2083067893981934, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21219877898693085, "step": 3830 }, { "epoch": 0.2395, "grad_norm": 2.5, "grad_norm_var": 0.0187164306640625, "learning_rate": 0.0001, "loss": 7.6724, "loss/crossentropy": 2.3150436878204346, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2536996081471443, "step": 3832 }, { "epoch": 0.239625, "grad_norm": 2.53125, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 7.569, "loss/crossentropy": 2.2580004930496216, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21672644466161728, "step": 3834 }, { "epoch": 0.23975, "grad_norm": 2.375, "grad_norm_var": 0.014354451497395834, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.080252170562744, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22742826491594315, "step": 3836 }, { "epoch": 0.239875, "grad_norm": 2.375, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 7.3651, "loss/crossentropy": 2.1281388998031616, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2401324361562729, "step": 3838 }, { "epoch": 0.24, "grad_norm": 2.703125, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.2224199771881104, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23487379401922226, "step": 3840 }, { "epoch": 0.240125, "grad_norm": 2.421875, "grad_norm_var": 0.04025065104166667, "learning_rate": 0.0001, "loss": 7.632, "loss/crossentropy": 2.424517512321472, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24695612490177155, "step": 3842 }, { "epoch": 0.24025, "grad_norm": 2.75, "grad_norm_var": 0.04254557291666667, "learning_rate": 0.0001, "loss": 7.5674, "loss/crossentropy": 2.1599162220954895, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21321508288383484, "step": 3844 }, { "epoch": 0.240375, "grad_norm": 2.59375, "grad_norm_var": 0.0422271728515625, "learning_rate": 0.0001, "loss": 7.6312, "loss/crossentropy": 2.0522512793540955, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21054793149232864, "step": 3846 }, { "epoch": 0.2405, "grad_norm": 2.46875, "grad_norm_var": 0.04182840983072917, "learning_rate": 0.0001, "loss": 7.6148, "loss/crossentropy": 2.3739218711853027, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24357128143310547, "step": 3848 }, { "epoch": 0.240625, "grad_norm": 2.125, "grad_norm_var": 0.050309244791666666, "learning_rate": 0.0001, "loss": 7.4666, "loss/crossentropy": 2.1554529666900635, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21792703121900558, "step": 3850 }, { "epoch": 0.24075, "grad_norm": 2.59375, "grad_norm_var": 0.05252278645833333, "learning_rate": 0.0001, "loss": 7.5105, "loss/crossentropy": 2.5243422985076904, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23979288339614868, "step": 3852 }, { "epoch": 0.240875, "grad_norm": 2.21875, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 7.6535, "loss/crossentropy": 2.169221580028534, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21038363128900528, "step": 3854 }, { "epoch": 0.241, "grad_norm": 2.34375, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 7.4283, "loss/crossentropy": 2.375051498413086, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24313046038150787, "step": 3856 }, { "epoch": 0.241125, "grad_norm": 2.453125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.504861831665039, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2226250097155571, "step": 3858 }, { "epoch": 0.24125, "grad_norm": 2.171875, "grad_norm_var": 0.022347005208333333, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.188872456550598, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22465746104717255, "step": 3860 }, { "epoch": 0.241375, "grad_norm": 2.34375, "grad_norm_var": 0.018831380208333335, "learning_rate": 0.0001, "loss": 7.4832, "loss/crossentropy": 2.109761595726013, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21876226365566254, "step": 3862 }, { "epoch": 0.2415, "grad_norm": 2.34375, "grad_norm_var": 0.0161041259765625, "learning_rate": 0.0001, "loss": 7.4889, "loss/crossentropy": 2.454153060913086, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2234111726284027, "step": 3864 }, { "epoch": 0.241625, "grad_norm": 2.359375, "grad_norm_var": 0.0175445556640625, "learning_rate": 0.0001, "loss": 7.5401, "loss/crossentropy": 2.2534934282302856, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.20882034301757812, "step": 3866 }, { "epoch": 0.24175, "grad_norm": 2.234375, "grad_norm_var": 0.01861572265625, "learning_rate": 0.0001, "loss": 7.3884, "loss/crossentropy": 2.46646249294281, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23306798189878464, "step": 3868 }, { "epoch": 0.241875, "grad_norm": 2.46875, "grad_norm_var": 0.01939697265625, "learning_rate": 0.0001, "loss": 7.4742, "loss/crossentropy": 2.286616086959839, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23071034252643585, "step": 3870 }, { "epoch": 0.242, "grad_norm": 2.203125, "grad_norm_var": 0.020002237955729165, "learning_rate": 0.0001, "loss": 7.4734, "loss/crossentropy": 2.3127135038375854, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2306494191288948, "step": 3872 }, { "epoch": 0.242125, "grad_norm": 2.34375, "grad_norm_var": 0.018636067708333332, "learning_rate": 0.0001, "loss": 7.3801, "loss/crossentropy": 2.292248845100403, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21103744953870773, "step": 3874 }, { "epoch": 0.24225, "grad_norm": 2.53125, "grad_norm_var": 0.020799763997395835, "learning_rate": 0.0001, "loss": 7.7475, "loss/crossentropy": 2.2768986225128174, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25374574959278107, "step": 3876 }, { "epoch": 0.242375, "grad_norm": 2.640625, "grad_norm_var": 0.02554931640625, "learning_rate": 0.0001, "loss": 7.4706, "loss/crossentropy": 2.335216999053955, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2206215113401413, "step": 3878 }, { "epoch": 0.2425, "grad_norm": 2.203125, "grad_norm_var": 0.027098592122395834, "learning_rate": 0.0001, "loss": 7.6204, "loss/crossentropy": 2.3161808252334595, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23353615403175354, "step": 3880 }, { "epoch": 0.242625, "grad_norm": 2.453125, "grad_norm_var": 0.024853515625, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.1703152656555176, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23688945174217224, "step": 3882 }, { "epoch": 0.24275, "grad_norm": 2.78125, "grad_norm_var": 0.027176920572916666, "learning_rate": 0.0001, "loss": 7.529, "loss/crossentropy": 2.342598557472229, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21841312944889069, "step": 3884 }, { "epoch": 0.242875, "grad_norm": 2.640625, "grad_norm_var": 0.09791259765625, "learning_rate": 0.0001, "loss": 7.3372, "loss/crossentropy": 2.1650543808937073, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22078751772642136, "step": 3886 }, { "epoch": 0.243, "grad_norm": 2.140625, "grad_norm_var": 0.11746419270833333, "learning_rate": 0.0001, "loss": 7.4851, "loss/crossentropy": 1.997887134552002, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21081900596618652, "step": 3888 }, { "epoch": 0.243125, "grad_norm": 2.5, "grad_norm_var": 0.11201070149739584, "learning_rate": 0.0001, "loss": 7.6291, "loss/crossentropy": 2.2321996688842773, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22545063495635986, "step": 3890 }, { "epoch": 0.24325, "grad_norm": 2.578125, "grad_norm_var": 0.11261393229166666, "learning_rate": 0.0001, "loss": 7.5623, "loss/crossentropy": 2.522809147834778, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24426687508821487, "step": 3892 }, { "epoch": 0.243375, "grad_norm": 2.1875, "grad_norm_var": 0.11669921875, "learning_rate": 0.0001, "loss": 7.4586, "loss/crossentropy": 2.3665112257003784, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.253150999546051, "step": 3894 }, { "epoch": 0.2435, "grad_norm": 2.78125, "grad_norm_var": 0.11687825520833334, "learning_rate": 0.0001, "loss": 7.5531, "loss/crossentropy": 2.4039018154144287, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22742793709039688, "step": 3896 }, { "epoch": 0.243625, "grad_norm": 2.125, "grad_norm_var": 0.1285552978515625, "learning_rate": 0.0001, "loss": 7.5957, "loss/crossentropy": 2.3447670936584473, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2258382812142372, "step": 3898 }, { "epoch": 0.24375, "grad_norm": 2.390625, "grad_norm_var": 0.12775065104166666, "learning_rate": 0.0001, "loss": 7.3161, "loss/crossentropy": 2.020743668079376, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2295529618859291, "step": 3900 }, { "epoch": 0.243875, "grad_norm": 2.4375, "grad_norm_var": 0.045685831705729166, "learning_rate": 0.0001, "loss": 7.4793, "loss/crossentropy": 2.0840115547180176, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21463429182767868, "step": 3902 }, { "epoch": 0.244, "grad_norm": 2.671875, "grad_norm_var": 0.03689778645833333, "learning_rate": 0.0001, "loss": 7.7833, "loss/crossentropy": 2.3722068071365356, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22333616763353348, "step": 3904 }, { "epoch": 0.244125, "grad_norm": 2.40625, "grad_norm_var": 0.038605753580729166, "learning_rate": 0.0001, "loss": 7.4049, "loss/crossentropy": 2.5307204723358154, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23095162957906723, "step": 3906 }, { "epoch": 0.24425, "grad_norm": 2.34375, "grad_norm_var": 0.036554972330729164, "learning_rate": 0.0001, "loss": 7.5943, "loss/crossentropy": 2.427748680114746, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2420305386185646, "step": 3908 }, { "epoch": 0.244375, "grad_norm": 2.375, "grad_norm_var": 0.033984375, "learning_rate": 0.0001, "loss": 7.5523, "loss/crossentropy": 2.3058314323425293, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.25578539073467255, "step": 3910 }, { "epoch": 0.2445, "grad_norm": 2.28125, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 7.2921, "loss/crossentropy": 1.9724953174591064, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.19885031133890152, "step": 3912 }, { "epoch": 0.244625, "grad_norm": 2.390625, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 7.6238, "loss/crossentropy": 2.1780654191970825, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2180589661002159, "step": 3914 }, { "epoch": 0.24475, "grad_norm": 2.375, "grad_norm_var": 0.017455037434895834, "learning_rate": 0.0001, "loss": 7.4598, "loss/crossentropy": 2.246911883354187, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22257865220308304, "step": 3916 }, { "epoch": 0.244875, "grad_norm": 2.484375, "grad_norm_var": 0.01953125, "learning_rate": 0.0001, "loss": 7.5233, "loss/crossentropy": 2.1807644367218018, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23529939353466034, "step": 3918 }, { "epoch": 0.245, "grad_norm": 2.421875, "grad_norm_var": 0.015241495768229167, "learning_rate": 0.0001, "loss": 7.6941, "loss/crossentropy": 2.4168232679367065, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24450047314167023, "step": 3920 }, { "epoch": 0.245125, "grad_norm": 2.640625, "grad_norm_var": 0.015836588541666665, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 2.1456319093704224, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22022899985313416, "step": 3922 }, { "epoch": 0.24525, "grad_norm": 2.46875, "grad_norm_var": 0.017183430989583335, "learning_rate": 0.0001, "loss": 7.5178, "loss/crossentropy": 2.1945712566375732, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23692822456359863, "step": 3924 }, { "epoch": 0.245375, "grad_norm": 2.25, "grad_norm_var": 0.017943318684895834, "learning_rate": 0.0001, "loss": 7.5715, "loss/crossentropy": 2.334246516227722, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24413339793682098, "step": 3926 }, { "epoch": 0.2455, "grad_norm": 2.40625, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 2.052259385585785, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21555544435977936, "step": 3928 }, { "epoch": 0.245625, "grad_norm": 2.46875, "grad_norm_var": 0.0144195556640625, "learning_rate": 0.0001, "loss": 7.6389, "loss/crossentropy": 2.4917489290237427, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24890758097171783, "step": 3930 }, { "epoch": 0.24575, "grad_norm": 2.359375, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 7.5094, "loss/crossentropy": 2.3055362701416016, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2225131392478943, "step": 3932 }, { "epoch": 0.245875, "grad_norm": 2.40625, "grad_norm_var": 0.013472493489583333, "learning_rate": 0.0001, "loss": 7.4905, "loss/crossentropy": 2.1624478101730347, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2125004380941391, "step": 3934 }, { "epoch": 0.246, "grad_norm": 2.328125, "grad_norm_var": 0.013997395833333334, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 2.408942222595215, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23912448436021805, "step": 3936 }, { "epoch": 0.246125, "grad_norm": 2.234375, "grad_norm_var": 0.012760416666666666, "learning_rate": 0.0001, "loss": 7.2563, "loss/crossentropy": 2.1905765533447266, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22357414662837982, "step": 3938 }, { "epoch": 0.24625, "grad_norm": 2.640625, "grad_norm_var": 0.0129547119140625, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.3154603242874146, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23004180938005447, "step": 3940 }, { "epoch": 0.246375, "grad_norm": 2.3125, "grad_norm_var": 0.011620076497395833, "learning_rate": 0.0001, "loss": 7.4689, "loss/crossentropy": 2.5023341178894043, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24651438742876053, "step": 3942 }, { "epoch": 0.2465, "grad_norm": 2.34375, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 7.4688, "loss/crossentropy": 2.0356882214546204, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20058569312095642, "step": 3944 }, { "epoch": 0.246625, "grad_norm": 2.5, "grad_norm_var": 0.010791015625, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.4191837310791016, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23377195000648499, "step": 3946 }, { "epoch": 0.24675, "grad_norm": 2.5, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 7.6407, "loss/crossentropy": 2.3417128324508667, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22980307787656784, "step": 3948 }, { "epoch": 0.246875, "grad_norm": 2.5, "grad_norm_var": 0.014143880208333333, "learning_rate": 0.0001, "loss": 7.5853, "loss/crossentropy": 2.3083345890045166, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23690129816532135, "step": 3950 }, { "epoch": 0.247, "grad_norm": 2.34375, "grad_norm_var": 0.017389933268229168, "learning_rate": 0.0001, "loss": 7.5956, "loss/crossentropy": 2.456955909729004, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2354580983519554, "step": 3952 }, { "epoch": 0.247125, "grad_norm": 2.359375, "grad_norm_var": 0.015055338541666666, "learning_rate": 0.0001, "loss": 7.4467, "loss/crossentropy": 2.162124276161194, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23025665432214737, "step": 3954 }, { "epoch": 0.24725, "grad_norm": 2.265625, "grad_norm_var": 0.013337198893229167, "learning_rate": 0.0001, "loss": 7.4083, "loss/crossentropy": 2.203832507133484, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21881967037916183, "step": 3956 }, { "epoch": 0.247375, "grad_norm": 2.453125, "grad_norm_var": 0.012848917643229167, "learning_rate": 0.0001, "loss": 7.4759, "loss/crossentropy": 2.226723313331604, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22026114910840988, "step": 3958 }, { "epoch": 0.2475, "grad_norm": 2.421875, "grad_norm_var": 0.012678019205729167, "learning_rate": 0.0001, "loss": 7.476, "loss/crossentropy": 2.3727807998657227, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.247600257396698, "step": 3960 }, { "epoch": 0.247625, "grad_norm": 2.375, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 7.483, "loss/crossentropy": 2.2924267053604126, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2366592362523079, "step": 3962 }, { "epoch": 0.24775, "grad_norm": 2.21875, "grad_norm_var": 0.013654581705729167, "learning_rate": 0.0001, "loss": 7.3951, "loss/crossentropy": 2.0975323915481567, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2243112102150917, "step": 3964 }, { "epoch": 0.247875, "grad_norm": 2.53125, "grad_norm_var": 0.010553995768229166, "learning_rate": 0.0001, "loss": 7.3489, "loss/crossentropy": 2.21097195148468, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2190333753824234, "step": 3966 }, { "epoch": 0.248, "grad_norm": 2.453125, "grad_norm_var": 0.007840983072916667, "learning_rate": 0.0001, "loss": 7.4721, "loss/crossentropy": 2.2507461309432983, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23200294375419617, "step": 3968 }, { "epoch": 0.248125, "grad_norm": 2.234375, "grad_norm_var": 0.0075592041015625, "learning_rate": 0.0001, "loss": 7.4843, "loss/crossentropy": 2.191147208213806, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23013149946928024, "step": 3970 }, { "epoch": 0.24825, "grad_norm": 2.453125, "grad_norm_var": 0.00748291015625, "learning_rate": 0.0001, "loss": 7.5614, "loss/crossentropy": 2.4555280208587646, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2434312105178833, "step": 3972 }, { "epoch": 0.248375, "grad_norm": 2.375, "grad_norm_var": 0.007157389322916667, "learning_rate": 0.0001, "loss": 7.4256, "loss/crossentropy": 2.2261093854904175, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23195409029722214, "step": 3974 }, { "epoch": 0.2485, "grad_norm": 2.59375, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 7.4072, "loss/crossentropy": 2.1353044509887695, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21157176792621613, "step": 3976 }, { "epoch": 0.248625, "grad_norm": 2.203125, "grad_norm_var": 0.0131988525390625, "learning_rate": 0.0001, "loss": 7.3719, "loss/crossentropy": 2.145276427268982, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21788546442985535, "step": 3978 }, { "epoch": 0.24875, "grad_norm": 2.484375, "grad_norm_var": 0.0127105712890625, "learning_rate": 0.0001, "loss": 7.4473, "loss/crossentropy": 2.37498140335083, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21876709163188934, "step": 3980 }, { "epoch": 0.248875, "grad_norm": 2.578125, "grad_norm_var": 0.01724853515625, "learning_rate": 0.0001, "loss": 7.589, "loss/crossentropy": 2.432854413986206, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2542262375354767, "step": 3982 }, { "epoch": 0.249, "grad_norm": 2.21875, "grad_norm_var": 0.0197418212890625, "learning_rate": 0.0001, "loss": 7.5591, "loss/crossentropy": 2.451367735862732, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22828736156225204, "step": 3984 }, { "epoch": 0.249125, "grad_norm": 2.375, "grad_norm_var": 0.0188629150390625, "learning_rate": 0.0001, "loss": 7.4799, "loss/crossentropy": 2.2237168550491333, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23009125888347626, "step": 3986 }, { "epoch": 0.24925, "grad_norm": 2.25, "grad_norm_var": 0.020703125, "learning_rate": 0.0001, "loss": 7.6225, "loss/crossentropy": 2.0535144805908203, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22887953370809555, "step": 3988 }, { "epoch": 0.249375, "grad_norm": 2.578125, "grad_norm_var": 0.0243804931640625, "learning_rate": 0.0001, "loss": 7.2587, "loss/crossentropy": 2.1713778972625732, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.20236970484256744, "step": 3990 }, { "epoch": 0.2495, "grad_norm": 2.40625, "grad_norm_var": 0.029539998372395834, "learning_rate": 0.0001, "loss": 7.5803, "loss/crossentropy": 2.314449429512024, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2168670818209648, "step": 3992 }, { "epoch": 0.249625, "grad_norm": 2.28125, "grad_norm_var": 0.0293121337890625, "learning_rate": 0.0001, "loss": 7.5327, "loss/crossentropy": 2.2450716495513916, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23649980127811432, "step": 3994 }, { "epoch": 0.24975, "grad_norm": 2.28125, "grad_norm_var": 0.029488118489583333, "learning_rate": 0.0001, "loss": 7.3866, "loss/crossentropy": 2.172006130218506, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21691139042377472, "step": 3996 }, { "epoch": 0.249875, "grad_norm": 2.46875, "grad_norm_var": 0.025321451822916667, "learning_rate": 0.0001, "loss": 7.4429, "loss/crossentropy": 2.230931878089905, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2391601949930191, "step": 3998 }, { "epoch": 0.25, "grad_norm": 2.703125, "grad_norm_var": 0.0368316650390625, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 2.2851897478103638, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2219674438238144, "step": 4000 } ], "logging_steps": 2, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.33181242621952e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }