diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48027 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5, + "eval_steps": 2000, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000125, + "grad_norm": 384.0, + "learning_rate": 1.18e-05, + "loss": 99.3112, + "loss/crossentropy": 9.301286220550537, + "loss/hidden": 16.5625, + "loss/jsd": 0.0, + "loss/logits": 7.171189308166504, + "step": 2 + }, + { + "epoch": 0.00025, + "grad_norm": 388.0, + "learning_rate": 1.3600000000000002e-05, + "loss": 98.5376, + "loss/crossentropy": 9.283345699310303, + "loss/hidden": 16.5625, + "loss/jsd": 0.0, + "loss/logits": 7.322719573974609, + "step": 4 + }, + { + "epoch": 0.000375, + "grad_norm": 380.0, + "learning_rate": 1.54e-05, + "loss": 98.4698, + "loss/crossentropy": 9.26666784286499, + "loss/hidden": 16.625, + "loss/jsd": 0.0, + "loss/logits": 7.094146490097046, + "step": 6 + }, + { + "epoch": 0.0005, + "grad_norm": 187.0, + "learning_rate": 1.72e-05, + "loss": 95.8811, + "loss/crossentropy": 9.060422420501709, + "loss/hidden": 16.5, + "loss/jsd": 0.0, + "loss/logits": 6.9519524574279785, + "step": 8 + }, + { + "epoch": 0.000625, + "grad_norm": 158.0, + "learning_rate": 1.9e-05, + "loss": 91.1537, + "loss/crossentropy": 8.855913162231445, + "loss/hidden": 16.375, + "loss/jsd": 0.0, + "loss/logits": 6.698125123977661, + "step": 10 + }, + { + "epoch": 0.00075, + "grad_norm": 135.0, + "learning_rate": 2.0800000000000004e-05, + "loss": 89.0469, + "loss/crossentropy": 8.479426860809326, + "loss/hidden": 16.3125, + "loss/jsd": 0.0, + "loss/logits": 6.1546266078948975, + "step": 12 + }, + { + "epoch": 0.000875, + "grad_norm": 119.0, + "learning_rate": 2.2600000000000004e-05, + "loss": 87.3701, + "loss/crossentropy": 8.417439937591553, + "loss/hidden": 16.25, + "loss/jsd": 0.0, + "loss/logits": 6.330978155136108, + "step": 14 + }, + { + "epoch": 0.001, + "grad_norm": 98.0, + "grad_norm_var": 15809.7625, + "learning_rate": 2.4400000000000004e-05, + "loss": 81.7839, + "loss/crossentropy": 7.888103723526001, + "loss/hidden": 15.8125, + "loss/jsd": 0.0, + "loss/logits": 5.809406042098999, + "step": 16 + }, + { + "epoch": 0.001125, + "grad_norm": 278.0, + "grad_norm_var": 12072.916666666666, + "learning_rate": 2.6200000000000003e-05, + "loss": 83.0321, + "loss/crossentropy": 7.949460506439209, + "loss/hidden": 15.34375, + "loss/jsd": 0.0, + "loss/logits": 6.00595760345459, + "step": 18 + }, + { + "epoch": 0.00125, + "grad_norm": 67.5, + "grad_norm_var": 8976.948958333332, + "learning_rate": 2.8000000000000003e-05, + "loss": 79.5947, + "loss/crossentropy": 7.64544939994812, + "loss/hidden": 15.25, + "loss/jsd": 0.0, + "loss/logits": 5.5388875007629395, + "step": 20 + }, + { + "epoch": 0.001375, + "grad_norm": 38.5, + "grad_norm_var": 4950.315625, + "learning_rate": 2.9800000000000006e-05, + "loss": 74.6424, + "loss/crossentropy": 7.209100246429443, + "loss/hidden": 15.15625, + "loss/jsd": 0.0, + "loss/logits": 5.076019763946533, + "step": 22 + }, + { + "epoch": 0.0015, + "grad_norm": 54.5, + "grad_norm_var": 4140.295833333334, + "learning_rate": 3.16e-05, + "loss": 71.7249, + "loss/crossentropy": 7.1052405834198, + "loss/hidden": 15.0, + "loss/jsd": 0.0, + "loss/logits": 5.032779216766357, + "step": 24 + }, + { + "epoch": 0.001625, + "grad_norm": 90.5, + "grad_norm_var": 3923.795833333333, + "learning_rate": 3.3400000000000005e-05, + "loss": 69.0909, + "loss/crossentropy": 6.593599557876587, + "loss/hidden": 14.9375, + "loss/jsd": 0.0, + "loss/logits": 4.861028671264648, + "step": 26 + }, + { + "epoch": 0.00175, + "grad_norm": 49.0, + "grad_norm_var": 4052.4239583333333, + "learning_rate": 3.520000000000001e-05, + "loss": 64.7694, + "loss/crossentropy": 6.363184213638306, + "loss/hidden": 14.59375, + "loss/jsd": 0.0, + "loss/logits": 4.430697441101074, + "step": 28 + }, + { + "epoch": 0.001875, + "grad_norm": 47.0, + "grad_norm_var": 4244.118489583333, + "learning_rate": 3.7e-05, + "loss": 59.3223, + "loss/crossentropy": 5.989596843719482, + "loss/hidden": 13.84375, + "loss/jsd": 0.0, + "loss/logits": 4.165619850158691, + "step": 30 + }, + { + "epoch": 0.002, + "grad_norm": 71.0, + "grad_norm_var": 4305.730989583333, + "learning_rate": 3.88e-05, + "loss": 55.1302, + "loss/crossentropy": 5.726909637451172, + "loss/hidden": 13.53125, + "loss/jsd": 0.0, + "loss/logits": 3.7759565114974976, + "step": 32 + }, + { + "epoch": 0.002125, + "grad_norm": 60.0, + "grad_norm_var": 934.0955729166667, + "learning_rate": 4.0600000000000004e-05, + "loss": 50.1945, + "loss/crossentropy": 5.208499431610107, + "loss/hidden": 13.125, + "loss/jsd": 0.0, + "loss/logits": 3.081121802330017, + "step": 34 + }, + { + "epoch": 0.00225, + "grad_norm": 46.75, + "grad_norm_var": 266.72083333333336, + "learning_rate": 4.240000000000001e-05, + "loss": 46.3994, + "loss/crossentropy": 4.913021564483643, + "loss/hidden": 12.375, + "loss/jsd": 0.0, + "loss/logits": 2.866178512573242, + "step": 36 + }, + { + "epoch": 0.002375, + "grad_norm": 51.75, + "grad_norm_var": 237.22395833333334, + "learning_rate": 4.420000000000001e-05, + "loss": 42.1907, + "loss/crossentropy": 4.504716157913208, + "loss/hidden": 12.0625, + "loss/jsd": 0.0, + "loss/logits": 2.6037776470184326, + "step": 38 + }, + { + "epoch": 0.0025, + "grad_norm": 50.0, + "grad_norm_var": 236.25390625, + "learning_rate": 4.600000000000001e-05, + "loss": 39.115, + "loss/crossentropy": 4.373331546783447, + "loss/hidden": 11.375, + "loss/jsd": 0.0, + "loss/logits": 2.2266069650650024, + "step": 40 + }, + { + "epoch": 0.002625, + "grad_norm": 33.0, + "grad_norm_var": 164.56640625, + "learning_rate": 4.78e-05, + "loss": 36.1801, + "loss/crossentropy": 4.276909589767456, + "loss/hidden": 11.0625, + "loss/jsd": 0.0, + "loss/logits": 2.2537089586257935, + "step": 42 + }, + { + "epoch": 0.00275, + "grad_norm": 41.0, + "grad_norm_var": 170.54765625, + "learning_rate": 4.96e-05, + "loss": 33.7672, + "loss/crossentropy": 3.979385256767273, + "loss/hidden": 10.59375, + "loss/jsd": 0.0, + "loss/logits": 1.776978850364685, + "step": 44 + }, + { + "epoch": 0.002875, + "grad_norm": 31.5, + "grad_norm_var": 205.69140625, + "learning_rate": 5.14e-05, + "loss": 31.4663, + "loss/crossentropy": 3.5722849369049072, + "loss/hidden": 10.15625, + "loss/jsd": 0.0, + "loss/logits": 1.7410615682601929, + "step": 46 + }, + { + "epoch": 0.003, + "grad_norm": 21.375, + "grad_norm_var": 211.38020833333334, + "learning_rate": 5.3200000000000006e-05, + "loss": 29.7082, + "loss/crossentropy": 3.679291844367981, + "loss/hidden": 9.625, + "loss/jsd": 0.0, + "loss/logits": 1.594287633895874, + "step": 48 + }, + { + "epoch": 0.003125, + "grad_norm": 23.125, + "grad_norm_var": 105.7416015625, + "learning_rate": 5.500000000000001e-05, + "loss": 28.489, + "loss/crossentropy": 3.9182190895080566, + "loss/hidden": 9.40625, + "loss/jsd": 0.0, + "loss/logits": 1.5025497078895569, + "step": 50 + }, + { + "epoch": 0.00325, + "grad_norm": 29.875, + "grad_norm_var": 105.0306640625, + "learning_rate": 5.680000000000001e-05, + "loss": 27.5703, + "loss/crossentropy": 3.526407241821289, + "loss/hidden": 9.25, + "loss/jsd": 0.0, + "loss/logits": 1.494104266166687, + "step": 52 + }, + { + "epoch": 0.003375, + "grad_norm": 19.625, + "grad_norm_var": 99.2416015625, + "learning_rate": 5.860000000000001e-05, + "loss": 26.1189, + "loss/crossentropy": 3.4616609811782837, + "loss/hidden": 9.0, + "loss/jsd": 0.0, + "loss/logits": 1.3545405268669128, + "step": 54 + }, + { + "epoch": 0.0035, + "grad_norm": 22.5, + "grad_norm_var": 54.81920572916667, + "learning_rate": 6.040000000000001e-05, + "loss": 24.328, + "loss/crossentropy": 3.308198928833008, + "loss/hidden": 8.75, + "loss/jsd": 0.0, + "loss/logits": 1.2083913683891296, + "step": 56 + }, + { + "epoch": 0.003625, + "grad_norm": 14.0625, + "grad_norm_var": 59.79152018229167, + "learning_rate": 6.220000000000001e-05, + "loss": 24.2188, + "loss/crossentropy": 3.5452929735183716, + "loss/hidden": 8.4375, + "loss/jsd": 0.0, + "loss/logits": 1.2204867601394653, + "step": 58 + }, + { + "epoch": 0.00375, + "grad_norm": 15.75, + "grad_norm_var": 52.173177083333336, + "learning_rate": 6.400000000000001e-05, + "loss": 22.8282, + "loss/crossentropy": 3.1143264770507812, + "loss/hidden": 8.40625, + "loss/jsd": 0.0, + "loss/logits": 1.1705525517463684, + "step": 60 + }, + { + "epoch": 0.003875, + "grad_norm": 20.125, + "grad_norm_var": 38.25930989583333, + "learning_rate": 6.58e-05, + "loss": 22.306, + "loss/crossentropy": 3.136604428291321, + "loss/hidden": 7.96875, + "loss/jsd": 0.0, + "loss/logits": 1.1404522061347961, + "step": 62 + }, + { + "epoch": 0.004, + "grad_norm": 16.5, + "grad_norm_var": 40.18274739583333, + "learning_rate": 6.76e-05, + "loss": 21.058, + "loss/crossentropy": 2.9673322439193726, + "loss/hidden": 7.703125, + "loss/jsd": 0.0, + "loss/logits": 1.0015667080879211, + "step": 64 + }, + { + "epoch": 0.004125, + "grad_norm": 11.6875, + "grad_norm_var": 39.946614583333336, + "learning_rate": 6.94e-05, + "loss": 21.0828, + "loss/crossentropy": 3.2232860326766968, + "loss/hidden": 7.546875, + "loss/jsd": 0.0, + "loss/logits": 0.964312881231308, + "step": 66 + }, + { + "epoch": 0.00425, + "grad_norm": 15.4375, + "grad_norm_var": 32.87902018229167, + "learning_rate": 7.120000000000001e-05, + "loss": 20.2688, + "loss/crossentropy": 3.371062755584717, + "loss/hidden": 7.53125, + "loss/jsd": 0.0, + "loss/logits": 0.971402496099472, + "step": 68 + }, + { + "epoch": 0.004375, + "grad_norm": 11.8125, + "grad_norm_var": 37.155322265625, + "learning_rate": 7.3e-05, + "loss": 19.6652, + "loss/crossentropy": 2.8037211894989014, + "loss/hidden": 7.40625, + "loss/jsd": 0.0, + "loss/logits": 0.952717661857605, + "step": 70 + }, + { + "epoch": 0.0045, + "grad_norm": 116.5, + "grad_norm_var": 640.5003743489583, + "learning_rate": 7.48e-05, + "loss": 19.6559, + "loss/crossentropy": 2.9093810319900513, + "loss/hidden": 7.15625, + "loss/jsd": 0.0, + "loss/logits": 0.9704654216766357, + "step": 72 + }, + { + "epoch": 0.004625, + "grad_norm": 9.4375, + "grad_norm_var": 651.0841145833333, + "learning_rate": 7.66e-05, + "loss": 18.7849, + "loss/crossentropy": 2.824882984161377, + "loss/hidden": 7.0625, + "loss/jsd": 0.0, + "loss/logits": 0.8673952519893646, + "step": 74 + }, + { + "epoch": 0.00475, + "grad_norm": 21.875, + "grad_norm_var": 649.4197916666667, + "learning_rate": 7.840000000000001e-05, + "loss": 18.5261, + "loss/crossentropy": 2.8125277757644653, + "loss/hidden": 7.109375, + "loss/jsd": 0.0, + "loss/logits": 0.8680737912654877, + "step": 76 + }, + { + "epoch": 0.004875, + "grad_norm": 12.1875, + "grad_norm_var": 658.5675618489583, + "learning_rate": 8.020000000000001e-05, + "loss": 18.4968, + "loss/crossentropy": 2.8050509691238403, + "loss/hidden": 6.828125, + "loss/jsd": 0.0, + "loss/logits": 0.8595540523529053, + "step": 78 + }, + { + "epoch": 0.005, + "grad_norm": 11.5, + "grad_norm_var": 669.7122233072917, + "learning_rate": 8.200000000000001e-05, + "loss": 18.0691, + "loss/crossentropy": 3.2670862674713135, + "loss/hidden": 6.8125, + "loss/jsd": 0.0, + "loss/logits": 0.8241342604160309, + "step": 80 + }, + { + "epoch": 0.005125, + "grad_norm": 12.625, + "grad_norm_var": 669.6054524739583, + "learning_rate": 8.38e-05, + "loss": 17.4693, + "loss/crossentropy": 2.700217127799988, + "loss/hidden": 6.765625, + "loss/jsd": 0.0, + "loss/logits": 0.8141748309135437, + "step": 82 + }, + { + "epoch": 0.00525, + "grad_norm": 11.4375, + "grad_norm_var": 679.333056640625, + "learning_rate": 8.560000000000001e-05, + "loss": 16.8553, + "loss/crossentropy": 2.619894862174988, + "loss/hidden": 6.578125, + "loss/jsd": 0.0, + "loss/logits": 0.755303144454956, + "step": 84 + }, + { + "epoch": 0.005375, + "grad_norm": 10.8125, + "grad_norm_var": 680.3473307291666, + "learning_rate": 8.740000000000001e-05, + "loss": 16.8983, + "loss/crossentropy": 2.8718719482421875, + "loss/hidden": 6.484375, + "loss/jsd": 0.0, + "loss/logits": 0.8335458338260651, + "step": 86 + }, + { + "epoch": 0.0055, + "grad_norm": 13.8125, + "grad_norm_var": 8.321858723958334, + "learning_rate": 8.92e-05, + "loss": 16.8672, + "loss/crossentropy": 2.806625247001648, + "loss/hidden": 6.546875, + "loss/jsd": 0.0, + "loss/logits": 0.7781052589416504, + "step": 88 + }, + { + "epoch": 0.005625, + "grad_norm": 12.1875, + "grad_norm_var": 7.507014973958333, + "learning_rate": 9.1e-05, + "loss": 16.4737, + "loss/crossentropy": 3.016478180885315, + "loss/hidden": 6.375, + "loss/jsd": 0.0, + "loss/logits": 0.7285971939563751, + "step": 90 + }, + { + "epoch": 0.00575, + "grad_norm": 13.25, + "grad_norm_var": 1.43046875, + "learning_rate": 9.28e-05, + "loss": 16.47, + "loss/crossentropy": 2.5847216844558716, + "loss/hidden": 6.359375, + "loss/jsd": 0.0, + "loss/logits": 0.6861400604248047, + "step": 92 + }, + { + "epoch": 0.005875, + "grad_norm": 9.875, + "grad_norm_var": 1.4640462239583334, + "learning_rate": 9.46e-05, + "loss": 16.3726, + "loss/crossentropy": 2.6700236797332764, + "loss/hidden": 6.328125, + "loss/jsd": 0.0, + "loss/logits": 0.7058612108230591, + "step": 94 + }, + { + "epoch": 0.006, + "grad_norm": 11.6875, + "grad_norm_var": 1.329541015625, + "learning_rate": 9.64e-05, + "loss": 16.0121, + "loss/crossentropy": 2.8999104499816895, + "loss/hidden": 6.203125, + "loss/jsd": 0.0, + "loss/logits": 0.7061053812503815, + "step": 96 + }, + { + "epoch": 0.006125, + "grad_norm": 12.25, + "grad_norm_var": 1.3148274739583334, + "learning_rate": 9.82e-05, + "loss": 15.9048, + "loss/crossentropy": 2.9132989645004272, + "loss/hidden": 6.234375, + "loss/jsd": 0.0, + "loss/logits": 0.7202947437763214, + "step": 98 + }, + { + "epoch": 0.00625, + "grad_norm": 10.1875, + "grad_norm_var": 1.2620930989583334, + "learning_rate": 0.0001, + "loss": 15.4134, + "loss/crossentropy": 2.6530884504318237, + "loss/hidden": 5.90625, + "loss/jsd": 0.0, + "loss/logits": 0.6581160724163055, + "step": 100 + }, + { + "epoch": 0.006375, + "grad_norm": 12.8125, + "grad_norm_var": 2.23736572265625, + "learning_rate": 0.0001, + "loss": 15.5444, + "loss/crossentropy": 2.285265564918518, + "loss/hidden": 6.15625, + "loss/jsd": 0.0, + "loss/logits": 0.6365102231502533, + "step": 102 + }, + { + "epoch": 0.0065, + "grad_norm": 12.25, + "grad_norm_var": 1.9623006184895833, + "learning_rate": 0.0001, + "loss": 15.3962, + "loss/crossentropy": 2.9150387048721313, + "loss/hidden": 5.890625, + "loss/jsd": 0.0, + "loss/logits": 0.6743068099021912, + "step": 104 + }, + { + "epoch": 0.006625, + "grad_norm": 11.375, + "grad_norm_var": 1.9106730143229167, + "learning_rate": 0.0001, + "loss": 15.0494, + "loss/crossentropy": 2.461984634399414, + "loss/hidden": 5.875, + "loss/jsd": 0.0, + "loss/logits": 0.5759885013103485, + "step": 106 + }, + { + "epoch": 0.00675, + "grad_norm": 9.4375, + "grad_norm_var": 1.83931884765625, + "learning_rate": 0.0001, + "loss": 15.2, + "loss/crossentropy": 2.545448660850525, + "loss/hidden": 5.828125, + "loss/jsd": 0.0, + "loss/logits": 0.6016611158847809, + "step": 108 + }, + { + "epoch": 0.006875, + "grad_norm": 9.5, + "grad_norm_var": 2.117997233072917, + "learning_rate": 0.0001, + "loss": 14.7974, + "loss/crossentropy": 2.70013689994812, + "loss/hidden": 5.8125, + "loss/jsd": 0.0, + "loss/logits": 0.6241994798183441, + "step": 110 + }, + { + "epoch": 0.007, + "grad_norm": 11.6875, + "grad_norm_var": 2.129715983072917, + "learning_rate": 0.0001, + "loss": 14.9825, + "loss/crossentropy": 2.7020071744918823, + "loss/hidden": 5.765625, + "loss/jsd": 0.0, + "loss/logits": 0.6234863996505737, + "step": 112 + }, + { + "epoch": 0.007125, + "grad_norm": 10.0, + "grad_norm_var": 2.9886067708333335, + "learning_rate": 0.0001, + "loss": 14.717, + "loss/crossentropy": 2.513030529022217, + "loss/hidden": 5.765625, + "loss/jsd": 0.0, + "loss/logits": 0.6006259322166443, + "step": 114 + }, + { + "epoch": 0.00725, + "grad_norm": 9.8125, + "grad_norm_var": 2.908317057291667, + "learning_rate": 0.0001, + "loss": 14.5928, + "loss/crossentropy": 2.696964979171753, + "loss/hidden": 5.640625, + "loss/jsd": 0.0, + "loss/logits": 0.6187423169612885, + "step": 116 + }, + { + "epoch": 0.007375, + "grad_norm": 8.0, + "grad_norm_var": 2.4882771809895834, + "learning_rate": 0.0001, + "loss": 14.4255, + "loss/crossentropy": 2.6013330221176147, + "loss/hidden": 5.578125, + "loss/jsd": 0.0, + "loss/logits": 0.6197507381439209, + "step": 118 + }, + { + "epoch": 0.0075, + "grad_norm": 9.5625, + "grad_norm_var": 2.2302042643229165, + "learning_rate": 0.0001, + "loss": 14.3271, + "loss/crossentropy": 2.411963939666748, + "loss/hidden": 5.609375, + "loss/jsd": 0.0, + "loss/logits": 0.625188797712326, + "step": 120 + }, + { + "epoch": 0.007625, + "grad_norm": 7.84375, + "grad_norm_var": 2.220686848958333, + "learning_rate": 0.0001, + "loss": 14.2801, + "loss/crossentropy": 2.6053736209869385, + "loss/hidden": 5.6875, + "loss/jsd": 0.0, + "loss/logits": 0.6165933012962341, + "step": 122 + }, + { + "epoch": 0.00775, + "grad_norm": 8.9375, + "grad_norm_var": 1.343603515625, + "learning_rate": 0.0001, + "loss": 14.212, + "loss/crossentropy": 2.693827986717224, + "loss/hidden": 5.546875, + "loss/jsd": 0.0, + "loss/logits": 0.593802809715271, + "step": 124 + }, + { + "epoch": 0.007875, + "grad_norm": 8.9375, + "grad_norm_var": 1.392822265625, + "learning_rate": 0.0001, + "loss": 14.083, + "loss/crossentropy": 2.649814248085022, + "loss/hidden": 5.484375, + "loss/jsd": 0.0, + "loss/logits": 0.5663131475448608, + "step": 126 + }, + { + "epoch": 0.008, + "grad_norm": 7.90625, + "grad_norm_var": 0.8796183268229166, + "learning_rate": 0.0001, + "loss": 13.7585, + "loss/crossentropy": 2.813218355178833, + "loss/hidden": 5.640625, + "loss/jsd": 0.0, + "loss/logits": 0.619841456413269, + "step": 128 + }, + { + "epoch": 0.008125, + "grad_norm": 9.4375, + "grad_norm_var": 0.5834635416666667, + "learning_rate": 0.0001, + "loss": 13.7834, + "loss/crossentropy": 2.496381640434265, + "loss/hidden": 5.34375, + "loss/jsd": 0.0, + "loss/logits": 0.534807562828064, + "step": 130 + }, + { + "epoch": 0.00825, + "grad_norm": 7.15625, + "grad_norm_var": 0.73668212890625, + "learning_rate": 0.0001, + "loss": 13.8102, + "loss/crossentropy": 2.587761878967285, + "loss/hidden": 5.6875, + "loss/jsd": 0.0, + "loss/logits": 0.5979687869548798, + "step": 132 + }, + { + "epoch": 0.008375, + "grad_norm": 11.0625, + "grad_norm_var": 0.9780232747395833, + "learning_rate": 0.0001, + "loss": 13.7661, + "loss/crossentropy": 3.0042346715927124, + "loss/hidden": 5.4375, + "loss/jsd": 0.0, + "loss/logits": 0.568140983581543, + "step": 134 + }, + { + "epoch": 0.0085, + "grad_norm": 9.5625, + "grad_norm_var": 0.8787394205729167, + "learning_rate": 0.0001, + "loss": 13.7338, + "loss/crossentropy": 2.4913647174835205, + "loss/hidden": 5.53125, + "loss/jsd": 0.0, + "loss/logits": 0.5167834609746933, + "step": 136 + }, + { + "epoch": 0.008625, + "grad_norm": 7.4375, + "grad_norm_var": 1.00718994140625, + "learning_rate": 0.0001, + "loss": 13.6016, + "loss/crossentropy": 2.468238115310669, + "loss/hidden": 5.28125, + "loss/jsd": 0.0, + "loss/logits": 0.5614461004734039, + "step": 138 + }, + { + "epoch": 0.00875, + "grad_norm": 8.8125, + "grad_norm_var": 1.0800740559895834, + "learning_rate": 0.0001, + "loss": 13.5316, + "loss/crossentropy": 2.4444445371627808, + "loss/hidden": 5.3125, + "loss/jsd": 0.0, + "loss/logits": 0.5187713205814362, + "step": 140 + }, + { + "epoch": 0.008875, + "grad_norm": 6.96875, + "grad_norm_var": 1.2684895833333334, + "learning_rate": 0.0001, + "loss": 13.0968, + "loss/crossentropy": 2.6214382648468018, + "loss/hidden": 5.265625, + "loss/jsd": 0.0, + "loss/logits": 0.5483916699886322, + "step": 142 + }, + { + "epoch": 0.009, + "grad_norm": 8.375, + "grad_norm_var": 1.2163045247395834, + "learning_rate": 0.0001, + "loss": 13.202, + "loss/crossentropy": 2.7945733070373535, + "loss/hidden": 5.265625, + "loss/jsd": 0.0, + "loss/logits": 0.5334698259830475, + "step": 144 + }, + { + "epoch": 0.009125, + "grad_norm": 8.375, + "grad_norm_var": 1.11070556640625, + "learning_rate": 0.0001, + "loss": 13.4962, + "loss/crossentropy": 2.6263811588287354, + "loss/hidden": 5.3125, + "loss/jsd": 0.0, + "loss/logits": 0.5304541736841202, + "step": 146 + }, + { + "epoch": 0.00925, + "grad_norm": 6.46875, + "grad_norm_var": 1.1537394205729166, + "learning_rate": 0.0001, + "loss": 13.0194, + "loss/crossentropy": 2.446092367172241, + "loss/hidden": 5.109375, + "loss/jsd": 0.0, + "loss/logits": 0.4983871430158615, + "step": 148 + }, + { + "epoch": 0.009375, + "grad_norm": 8.875, + "grad_norm_var": 0.7020182291666667, + "learning_rate": 0.0001, + "loss": 13.4196, + "loss/crossentropy": 2.954146981239319, + "loss/hidden": 5.265625, + "loss/jsd": 0.0, + "loss/logits": 0.5594009757041931, + "step": 150 + }, + { + "epoch": 0.0095, + "grad_norm": 7.9375, + "grad_norm_var": 0.563916015625, + "learning_rate": 0.0001, + "loss": 13.1519, + "loss/crossentropy": 2.7116650342941284, + "loss/hidden": 5.25, + "loss/jsd": 0.0, + "loss/logits": 0.5489330589771271, + "step": 152 + }, + { + "epoch": 0.009625, + "grad_norm": 6.875, + "grad_norm_var": 0.66314697265625, + "learning_rate": 0.0001, + "loss": 12.9977, + "loss/crossentropy": 2.6282447576522827, + "loss/hidden": 5.1875, + "loss/jsd": 0.0, + "loss/logits": 0.4889778196811676, + "step": 154 + }, + { + "epoch": 0.00975, + "grad_norm": 6.5625, + "grad_norm_var": 0.7279256184895834, + "learning_rate": 0.0001, + "loss": 12.9168, + "loss/crossentropy": 2.5541906356811523, + "loss/hidden": 5.140625, + "loss/jsd": 0.0, + "loss/logits": 0.5468989908695221, + "step": 156 + }, + { + "epoch": 0.009875, + "grad_norm": 8.25, + "grad_norm_var": 0.7774739583333333, + "learning_rate": 0.0001, + "loss": 12.8049, + "loss/crossentropy": 2.3998714685440063, + "loss/hidden": 5.171875, + "loss/jsd": 0.0, + "loss/logits": 0.4967530369758606, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 8.125, + "grad_norm_var": 0.6618448893229166, + "learning_rate": 0.0001, + "loss": 12.8303, + "loss/crossentropy": 2.5461435317993164, + "loss/hidden": 5.09375, + "loss/jsd": 0.0, + "loss/logits": 0.5074218511581421, + "step": 160 + }, + { + "epoch": 0.010125, + "grad_norm": 7.8125, + "grad_norm_var": 0.6085245768229167, + "learning_rate": 0.0001, + "loss": 12.7334, + "loss/crossentropy": 2.2296184301376343, + "loss/hidden": 5.03125, + "loss/jsd": 0.0, + "loss/logits": 0.4967309385538101, + "step": 162 + }, + { + "epoch": 0.01025, + "grad_norm": 7.28125, + "grad_norm_var": 0.48121337890625, + "learning_rate": 0.0001, + "loss": 12.8275, + "loss/crossentropy": 2.348438858985901, + "loss/hidden": 5.09375, + "loss/jsd": 0.0, + "loss/logits": 0.47058284282684326, + "step": 164 + }, + { + "epoch": 0.010375, + "grad_norm": 7.0, + "grad_norm_var": 0.39407552083333336, + "learning_rate": 0.0001, + "loss": 12.6261, + "loss/crossentropy": 2.4020251035690308, + "loss/hidden": 4.96875, + "loss/jsd": 0.0, + "loss/logits": 0.4758017808198929, + "step": 166 + }, + { + "epoch": 0.0105, + "grad_norm": 5.65625, + "grad_norm_var": 0.6719685872395833, + "learning_rate": 0.0001, + "loss": 12.7439, + "loss/crossentropy": 2.4506293535232544, + "loss/hidden": 4.9375, + "loss/jsd": 0.0, + "loss/logits": 0.5312856733798981, + "step": 168 + }, + { + "epoch": 0.010625, + "grad_norm": 7.40625, + "grad_norm_var": 0.6060506184895833, + "learning_rate": 0.0001, + "loss": 12.582, + "loss/crossentropy": 2.5331802368164062, + "loss/hidden": 4.9375, + "loss/jsd": 0.0, + "loss/logits": 0.5095243901014328, + "step": 170 + }, + { + "epoch": 0.01075, + "grad_norm": 8.625, + "grad_norm_var": 0.6595662434895834, + "learning_rate": 0.0001, + "loss": 12.4879, + "loss/crossentropy": 2.65364670753479, + "loss/hidden": 5.140625, + "loss/jsd": 0.0, + "loss/logits": 0.543939620256424, + "step": 172 + }, + { + "epoch": 0.010875, + "grad_norm": 6.21875, + "grad_norm_var": 0.6911417643229166, + "learning_rate": 0.0001, + "loss": 12.2931, + "loss/crossentropy": 2.2634752988815308, + "loss/hidden": 4.890625, + "loss/jsd": 0.0, + "loss/logits": 0.45544371008872986, + "step": 174 + }, + { + "epoch": 0.011, + "grad_norm": 7.21875, + "grad_norm_var": 0.6577473958333333, + "learning_rate": 0.0001, + "loss": 12.3996, + "loss/crossentropy": 2.3653087615966797, + "loss/hidden": 4.921875, + "loss/jsd": 0.0, + "loss/logits": 0.4665989428758621, + "step": 176 + }, + { + "epoch": 0.011125, + "grad_norm": 7.8125, + "grad_norm_var": 0.6873006184895833, + "learning_rate": 0.0001, + "loss": 12.335, + "loss/crossentropy": 2.274166226387024, + "loss/hidden": 4.9375, + "loss/jsd": 0.0, + "loss/logits": 0.48722338676452637, + "step": 178 + }, + { + "epoch": 0.01125, + "grad_norm": 8.875, + "grad_norm_var": 0.8094685872395834, + "learning_rate": 0.0001, + "loss": 12.4249, + "loss/crossentropy": 2.464481830596924, + "loss/hidden": 4.953125, + "loss/jsd": 0.0, + "loss/logits": 0.5341024994850159, + "step": 180 + }, + { + "epoch": 0.011375, + "grad_norm": 6.84375, + "grad_norm_var": 0.8151692708333333, + "learning_rate": 0.0001, + "loss": 12.12, + "loss/crossentropy": 2.5521204471588135, + "loss/hidden": 5.09375, + "loss/jsd": 0.0, + "loss/logits": 0.5266247987747192, + "step": 182 + }, + { + "epoch": 0.0115, + "grad_norm": 6.375, + "grad_norm_var": 0.6587076822916667, + "learning_rate": 0.0001, + "loss": 12.0581, + "loss/crossentropy": 2.380069375038147, + "loss/hidden": 4.921875, + "loss/jsd": 0.0, + "loss/logits": 0.4448339492082596, + "step": 184 + }, + { + "epoch": 0.011625, + "grad_norm": 7.46875, + "grad_norm_var": 0.6198527018229166, + "learning_rate": 0.0001, + "loss": 12.2557, + "loss/crossentropy": 2.6351869106292725, + "loss/hidden": 4.921875, + "loss/jsd": 0.0, + "loss/logits": 0.51109179854393, + "step": 186 + }, + { + "epoch": 0.01175, + "grad_norm": 6.625, + "grad_norm_var": 0.461572265625, + "learning_rate": 0.0001, + "loss": 12.3102, + "loss/crossentropy": 2.606539011001587, + "loss/hidden": 4.90625, + "loss/jsd": 0.0, + "loss/logits": 0.4482808858156204, + "step": 188 + }, + { + "epoch": 0.011875, + "grad_norm": 7.3125, + "grad_norm_var": 0.40491129557291666, + "learning_rate": 0.0001, + "loss": 11.8983, + "loss/crossentropy": 2.5177031755447388, + "loss/hidden": 4.8125, + "loss/jsd": 0.0, + "loss/logits": 0.4657522886991501, + "step": 190 + }, + { + "epoch": 0.012, + "grad_norm": 5.40625, + "grad_norm_var": 0.5950358072916667, + "learning_rate": 0.0001, + "loss": 11.8962, + "loss/crossentropy": 2.5478276014328003, + "loss/hidden": 4.90625, + "loss/jsd": 0.0, + "loss/logits": 0.4511236548423767, + "step": 192 + }, + { + "epoch": 0.012125, + "grad_norm": 7.25, + "grad_norm_var": 0.5598592122395833, + "learning_rate": 0.0001, + "loss": 11.9408, + "loss/crossentropy": 2.08566415309906, + "loss/hidden": 4.90625, + "loss/jsd": 0.0, + "loss/logits": 0.4315005987882614, + "step": 194 + }, + { + "epoch": 0.01225, + "grad_norm": 7.03125, + "grad_norm_var": 0.40950113932291665, + "learning_rate": 0.0001, + "loss": 12.0969, + "loss/crossentropy": 2.634473204612732, + "loss/hidden": 4.84375, + "loss/jsd": 0.0, + "loss/logits": 0.4682666063308716, + "step": 196 + }, + { + "epoch": 0.012375, + "grad_norm": 7.25, + "grad_norm_var": 0.4279296875, + "learning_rate": 0.0001, + "loss": 12.0544, + "loss/crossentropy": 2.6797198057174683, + "loss/hidden": 4.84375, + "loss/jsd": 0.0, + "loss/logits": 0.4552183598279953, + "step": 198 + }, + { + "epoch": 0.0125, + "grad_norm": 6.75, + "grad_norm_var": 0.4488118489583333, + "learning_rate": 0.0001, + "loss": 11.9949, + "loss/crossentropy": 2.9568880796432495, + "loss/hidden": 4.921875, + "loss/jsd": 0.0, + "loss/logits": 0.5152240097522736, + "step": 200 + }, + { + "epoch": 0.012625, + "grad_norm": 6.1875, + "grad_norm_var": 0.4198527018229167, + "learning_rate": 0.0001, + "loss": 11.876, + "loss/crossentropy": 2.4664944410324097, + "loss/hidden": 4.734375, + "loss/jsd": 0.0, + "loss/logits": 0.4684390127658844, + "step": 202 + }, + { + "epoch": 0.01275, + "grad_norm": 6.5625, + "grad_norm_var": 0.41952718098958336, + "learning_rate": 0.0001, + "loss": 11.9644, + "loss/crossentropy": 2.580668091773987, + "loss/hidden": 4.78125, + "loss/jsd": 0.0, + "loss/logits": 0.4738956689834595, + "step": 204 + }, + { + "epoch": 0.012875, + "grad_norm": 6.3125, + "grad_norm_var": 0.39635009765625, + "learning_rate": 0.0001, + "loss": 12.0377, + "loss/crossentropy": 2.367862343788147, + "loss/hidden": 4.671875, + "loss/jsd": 0.0, + "loss/logits": 0.45996397733688354, + "step": 206 + }, + { + "epoch": 0.013, + "grad_norm": 6.09375, + "grad_norm_var": 0.27681884765625, + "learning_rate": 0.0001, + "loss": 11.9037, + "loss/crossentropy": 2.5246529579162598, + "loss/hidden": 4.765625, + "loss/jsd": 0.0, + "loss/logits": 0.46563032269477844, + "step": 208 + }, + { + "epoch": 0.013125, + "grad_norm": 6.46875, + "grad_norm_var": 0.24596354166666667, + "learning_rate": 0.0001, + "loss": 11.7128, + "loss/crossentropy": 2.20585036277771, + "loss/hidden": 4.640625, + "loss/jsd": 0.0, + "loss/logits": 0.41547301411628723, + "step": 210 + }, + { + "epoch": 0.01325, + "grad_norm": 5.9375, + "grad_norm_var": 0.2263671875, + "learning_rate": 0.0001, + "loss": 11.7218, + "loss/crossentropy": 2.3064881563186646, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.4325401932001114, + "step": 212 + }, + { + "epoch": 0.013375, + "grad_norm": 7.03125, + "grad_norm_var": 0.16054280598958334, + "learning_rate": 0.0001, + "loss": 11.5407, + "loss/crossentropy": 2.3898541927337646, + "loss/hidden": 4.609375, + "loss/jsd": 0.0, + "loss/logits": 0.43332116305828094, + "step": 214 + }, + { + "epoch": 0.0135, + "grad_norm": 6.5625, + "grad_norm_var": 0.150244140625, + "learning_rate": 0.0001, + "loss": 11.8046, + "loss/crossentropy": 2.456748604774475, + "loss/hidden": 4.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4493984282016754, + "step": 216 + }, + { + "epoch": 0.013625, + "grad_norm": 6.25, + "grad_norm_var": 0.18606770833333333, + "learning_rate": 0.0001, + "loss": 11.8232, + "loss/crossentropy": 2.7504690885543823, + "loss/hidden": 4.6875, + "loss/jsd": 0.0, + "loss/logits": 0.44959259033203125, + "step": 218 + }, + { + "epoch": 0.01375, + "grad_norm": 6.65625, + "grad_norm_var": 0.18313802083333333, + "learning_rate": 0.0001, + "loss": 11.7742, + "loss/crossentropy": 2.272356390953064, + "loss/hidden": 4.625, + "loss/jsd": 0.0, + "loss/logits": 0.4380947947502136, + "step": 220 + }, + { + "epoch": 0.013875, + "grad_norm": 5.71875, + "grad_norm_var": 0.20331624348958333, + "learning_rate": 0.0001, + "loss": 11.7707, + "loss/crossentropy": 2.539223790168762, + "loss/hidden": 4.640625, + "loss/jsd": 0.0, + "loss/logits": 0.41349247097969055, + "step": 222 + }, + { + "epoch": 0.014, + "grad_norm": 6.3125, + "grad_norm_var": 0.21122639973958332, + "learning_rate": 0.0001, + "loss": 11.6392, + "loss/crossentropy": 2.6272025108337402, + "loss/hidden": 4.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4406648874282837, + "step": 224 + }, + { + "epoch": 0.014125, + "grad_norm": 5.65625, + "grad_norm_var": 0.23255208333333333, + "learning_rate": 0.0001, + "loss": 11.3455, + "loss/crossentropy": 2.1202113032341003, + "loss/hidden": 4.5625, + "loss/jsd": 0.0, + "loss/logits": 0.42472922801971436, + "step": 226 + }, + { + "epoch": 0.01425, + "grad_norm": 6.5625, + "grad_norm_var": 0.23212483723958333, + "learning_rate": 0.0001, + "loss": 11.5275, + "loss/crossentropy": 2.420728087425232, + "loss/hidden": 4.640625, + "loss/jsd": 0.0, + "loss/logits": 0.4595801681280136, + "step": 228 + }, + { + "epoch": 0.014375, + "grad_norm": 5.65625, + "grad_norm_var": 0.22057291666666667, + "learning_rate": 0.0001, + "loss": 11.849, + "loss/crossentropy": 2.58668851852417, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.44095560908317566, + "step": 230 + }, + { + "epoch": 0.0145, + "grad_norm": 5.28125, + "grad_norm_var": 0.210546875, + "learning_rate": 0.0001, + "loss": 11.4306, + "loss/crossentropy": 2.3560508489608765, + "loss/hidden": 4.578125, + "loss/jsd": 0.0, + "loss/logits": 0.4567548930644989, + "step": 232 + }, + { + "epoch": 0.014625, + "grad_norm": 5.78125, + "grad_norm_var": 0.19872639973958334, + "learning_rate": 0.0001, + "loss": 11.41, + "loss/crossentropy": 2.362083673477173, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.4453700929880142, + "step": 234 + }, + { + "epoch": 0.01475, + "grad_norm": 6.625, + "grad_norm_var": 0.18801676432291667, + "learning_rate": 0.0001, + "loss": 11.3155, + "loss/crossentropy": 2.6302807331085205, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.45155879855155945, + "step": 236 + }, + { + "epoch": 0.014875, + "grad_norm": 6.28125, + "grad_norm_var": 0.16236979166666668, + "learning_rate": 0.0001, + "loss": 11.5746, + "loss/crossentropy": 2.504029393196106, + "loss/hidden": 4.53125, + "loss/jsd": 0.0, + "loss/logits": 0.423601895570755, + "step": 238 + }, + { + "epoch": 0.015, + "grad_norm": 5.84375, + "grad_norm_var": 0.14752604166666666, + "learning_rate": 0.0001, + "loss": 11.3137, + "loss/crossentropy": 2.496834635734558, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.44761495292186737, + "step": 240 + }, + { + "epoch": 0.015125, + "grad_norm": 5.75, + "grad_norm_var": 0.14377848307291666, + "learning_rate": 0.0001, + "loss": 11.215, + "loss/crossentropy": 2.5440114736557007, + "loss/hidden": 4.5, + "loss/jsd": 0.0, + "loss/logits": 0.4467613846063614, + "step": 242 + }, + { + "epoch": 0.01525, + "grad_norm": 6.8125, + "grad_norm_var": 0.16174723307291666, + "learning_rate": 0.0001, + "loss": 11.4828, + "loss/crossentropy": 2.3933448791503906, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.4251607805490494, + "step": 244 + }, + { + "epoch": 0.015375, + "grad_norm": 5.28125, + "grad_norm_var": 0.19026285807291668, + "learning_rate": 0.0001, + "loss": 11.2663, + "loss/crossentropy": 2.709121346473694, + "loss/hidden": 4.609375, + "loss/jsd": 0.0, + "loss/logits": 0.4511110782623291, + "step": 246 + }, + { + "epoch": 0.0155, + "grad_norm": 6.375, + "grad_norm_var": 0.18007405598958334, + "learning_rate": 0.0001, + "loss": 11.2542, + "loss/crossentropy": 2.65135395526886, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.42732760310173035, + "step": 248 + }, + { + "epoch": 0.015625, + "grad_norm": 5.5, + "grad_norm_var": 0.25390218098958334, + "learning_rate": 0.0001, + "loss": 11.0863, + "loss/crossentropy": 2.531478524208069, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.39773157238960266, + "step": 250 + }, + { + "epoch": 0.01575, + "grad_norm": 5.75, + "grad_norm_var": 0.23079427083333334, + "learning_rate": 0.0001, + "loss": 11.3519, + "loss/crossentropy": 2.1759145259857178, + "loss/hidden": 4.515625, + "loss/jsd": 0.0, + "loss/logits": 0.40788644552230835, + "step": 252 + }, + { + "epoch": 0.015875, + "grad_norm": 6.28125, + "grad_norm_var": 0.246337890625, + "learning_rate": 0.0001, + "loss": 10.9975, + "loss/crossentropy": 2.486463189125061, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.4049537926912308, + "step": 254 + }, + { + "epoch": 0.016, + "grad_norm": 4.90625, + "grad_norm_var": 0.32945556640625, + "learning_rate": 0.0001, + "loss": 10.9813, + "loss/crossentropy": 2.3456650972366333, + "loss/hidden": 4.5, + "loss/jsd": 0.0, + "loss/logits": 0.3914157599210739, + "step": 256 + }, + { + "epoch": 0.016125, + "grad_norm": 5.8125, + "grad_norm_var": 0.35302327473958334, + "learning_rate": 0.0001, + "loss": 11.124, + "loss/crossentropy": 2.7621407508850098, + "loss/hidden": 4.453125, + "loss/jsd": 0.0, + "loss/logits": 0.4217756986618042, + "step": 258 + }, + { + "epoch": 0.01625, + "grad_norm": 5.09375, + "grad_norm_var": 0.30201416015625, + "learning_rate": 0.0001, + "loss": 11.0913, + "loss/crossentropy": 2.520516872406006, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.42682692408561707, + "step": 260 + }, + { + "epoch": 0.016375, + "grad_norm": 5.875, + "grad_norm_var": 0.289697265625, + "learning_rate": 0.0001, + "loss": 11.0406, + "loss/crossentropy": 2.668264865875244, + "loss/hidden": 4.421875, + "loss/jsd": 0.0, + "loss/logits": 0.44012486934661865, + "step": 262 + }, + { + "epoch": 0.0165, + "grad_norm": 5.71875, + "grad_norm_var": 0.221337890625, + "learning_rate": 0.0001, + "loss": 11.1313, + "loss/crossentropy": 2.6228344440460205, + "loss/hidden": 4.40625, + "loss/jsd": 0.0, + "loss/logits": 0.47360049188137054, + "step": 264 + }, + { + "epoch": 0.016625, + "grad_norm": 5.1875, + "grad_norm_var": 0.2591796875, + "learning_rate": 0.0001, + "loss": 10.8727, + "loss/crossentropy": 2.07044917345047, + "loss/hidden": 4.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3736244738101959, + "step": 266 + }, + { + "epoch": 0.01675, + "grad_norm": 5.59375, + "grad_norm_var": 0.24763997395833334, + "learning_rate": 0.0001, + "loss": 10.9825, + "loss/crossentropy": 2.195676624774933, + "loss/hidden": 4.375, + "loss/jsd": 0.0, + "loss/logits": 0.41074641048908234, + "step": 268 + }, + { + "epoch": 0.016875, + "grad_norm": 5.15625, + "grad_norm_var": 0.16717122395833334, + "learning_rate": 0.0001, + "loss": 10.9858, + "loss/crossentropy": 2.6045761108398438, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.43537537753582, + "step": 270 + }, + { + "epoch": 0.017, + "grad_norm": 5.40625, + "grad_norm_var": 0.17616780598958334, + "learning_rate": 0.0001, + "loss": 11.1417, + "loss/crossentropy": 2.343075156211853, + "loss/hidden": 4.4375, + "loss/jsd": 0.0, + "loss/logits": 0.3932172954082489, + "step": 272 + }, + { + "epoch": 0.017125, + "grad_norm": 5.96875, + "grad_norm_var": 0.18162434895833332, + "learning_rate": 0.0001, + "loss": 10.9988, + "loss/crossentropy": 2.4649263620376587, + "loss/hidden": 4.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3909170925617218, + "step": 274 + }, + { + "epoch": 0.01725, + "grad_norm": 4.5, + "grad_norm_var": 0.23905843098958332, + "learning_rate": 0.0001, + "loss": 10.9516, + "loss/crossentropy": 2.3632274866104126, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.39788326621055603, + "step": 276 + }, + { + "epoch": 0.017375, + "grad_norm": 5.78125, + "grad_norm_var": 0.24231363932291666, + "learning_rate": 0.0001, + "loss": 10.9188, + "loss/crossentropy": 2.5861356258392334, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.41938433051109314, + "step": 278 + }, + { + "epoch": 0.0175, + "grad_norm": 5.21875, + "grad_norm_var": 0.23834635416666666, + "learning_rate": 0.0001, + "loss": 10.92, + "loss/crossentropy": 2.5754419565200806, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.4088515043258667, + "step": 280 + }, + { + "epoch": 0.017625, + "grad_norm": 5.53125, + "grad_norm_var": 0.21184895833333334, + "learning_rate": 0.0001, + "loss": 10.7969, + "loss/crossentropy": 2.241589307785034, + "loss/hidden": 4.328125, + "loss/jsd": 0.0, + "loss/logits": 0.38494938611984253, + "step": 282 + }, + { + "epoch": 0.01775, + "grad_norm": 5.40625, + "grad_norm_var": 0.3136678059895833, + "learning_rate": 0.0001, + "loss": 10.9124, + "loss/crossentropy": 2.332160472869873, + "loss/hidden": 4.40625, + "loss/jsd": 0.0, + "loss/logits": 0.4286513030529022, + "step": 284 + }, + { + "epoch": 0.017875, + "grad_norm": 5.84375, + "grad_norm_var": 0.2992472330729167, + "learning_rate": 0.0001, + "loss": 10.8958, + "loss/crossentropy": 2.452752709388733, + "loss/hidden": 4.328125, + "loss/jsd": 0.0, + "loss/logits": 0.37612101435661316, + "step": 286 + }, + { + "epoch": 0.018, + "grad_norm": 4.84375, + "grad_norm_var": 0.249072265625, + "learning_rate": 0.0001, + "loss": 10.7195, + "loss/crossentropy": 2.2290940284729004, + "loss/hidden": 4.28125, + "loss/jsd": 0.0, + "loss/logits": 0.37554706633090973, + "step": 288 + }, + { + "epoch": 0.018125, + "grad_norm": 5.875, + "grad_norm_var": 0.24680989583333332, + "learning_rate": 0.0001, + "loss": 10.7342, + "loss/crossentropy": 2.4139484167099, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.40335869789123535, + "step": 290 + }, + { + "epoch": 0.01825, + "grad_norm": 4.78125, + "grad_norm_var": 0.21402587890625, + "learning_rate": 0.0001, + "loss": 10.6719, + "loss/crossentropy": 2.56483793258667, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.41296976804733276, + "step": 292 + }, + { + "epoch": 0.018375, + "grad_norm": 6.375, + "grad_norm_var": 0.24823811848958333, + "learning_rate": 0.0001, + "loss": 10.9241, + "loss/crossentropy": 2.3277297019958496, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3984246253967285, + "step": 294 + }, + { + "epoch": 0.0185, + "grad_norm": 4.71875, + "grad_norm_var": 0.28815104166666666, + "learning_rate": 0.0001, + "loss": 10.7746, + "loss/crossentropy": 2.3450149297714233, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3996598720550537, + "step": 296 + }, + { + "epoch": 0.018625, + "grad_norm": 5.03125, + "grad_norm_var": 0.352734375, + "learning_rate": 0.0001, + "loss": 10.5621, + "loss/crossentropy": 2.317037582397461, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.40160971879959106, + "step": 298 + }, + { + "epoch": 0.01875, + "grad_norm": 5.3125, + "grad_norm_var": 0.23987223307291666, + "learning_rate": 0.0001, + "loss": 10.9369, + "loss/crossentropy": 2.5068975687026978, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.410100519657135, + "step": 300 + }, + { + "epoch": 0.018875, + "grad_norm": 4.9375, + "grad_norm_var": 0.22589518229166666, + "learning_rate": 0.0001, + "loss": 10.7482, + "loss/crossentropy": 2.351959705352783, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.39118409156799316, + "step": 302 + }, + { + "epoch": 0.019, + "grad_norm": 5.59375, + "grad_norm_var": 0.21608072916666668, + "learning_rate": 0.0001, + "loss": 10.7665, + "loss/crossentropy": 2.3877638578414917, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.39836424589157104, + "step": 304 + }, + { + "epoch": 0.019125, + "grad_norm": 5.53125, + "grad_norm_var": 0.19894205729166667, + "learning_rate": 0.0001, + "loss": 10.7703, + "loss/crossentropy": 2.600021004676819, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.41590385138988495, + "step": 306 + }, + { + "epoch": 0.01925, + "grad_norm": 5.40625, + "grad_norm_var": 0.18485921223958332, + "learning_rate": 0.0001, + "loss": 10.6674, + "loss/crossentropy": 2.4758663177490234, + "loss/hidden": 4.28125, + "loss/jsd": 0.0, + "loss/logits": 0.39325758814811707, + "step": 308 + }, + { + "epoch": 0.019375, + "grad_norm": 5.25, + "grad_norm_var": 0.10755208333333334, + "learning_rate": 0.0001, + "loss": 10.5967, + "loss/crossentropy": 2.3135849237442017, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.3709346354007721, + "step": 310 + }, + { + "epoch": 0.0195, + "grad_norm": 4.34375, + "grad_norm_var": 0.20885416666666667, + "learning_rate": 0.0001, + "loss": 10.7752, + "loss/crossentropy": 2.4139580726623535, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.4005644619464874, + "step": 312 + }, + { + "epoch": 0.019625, + "grad_norm": 6.03125, + "grad_norm_var": 0.21534830729166668, + "learning_rate": 0.0001, + "loss": 10.779, + "loss/crossentropy": 2.5271564722061157, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.37980175018310547, + "step": 314 + }, + { + "epoch": 0.01975, + "grad_norm": 4.53125, + "grad_norm_var": 0.25982666015625, + "learning_rate": 0.0001, + "loss": 10.5117, + "loss/crossentropy": 2.5739694833755493, + "loss/hidden": 4.28125, + "loss/jsd": 0.0, + "loss/logits": 0.40202172100543976, + "step": 316 + }, + { + "epoch": 0.019875, + "grad_norm": 6.625, + "grad_norm_var": 0.36412353515625, + "learning_rate": 0.0001, + "loss": 10.5532, + "loss/crossentropy": 2.245994448661804, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.36041176319122314, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 4.65625, + "grad_norm_var": 0.392041015625, + "learning_rate": 0.0001, + "loss": 10.5765, + "loss/crossentropy": 2.5354862213134766, + "loss/hidden": 4.28125, + "loss/jsd": 0.0, + "loss/logits": 0.3732207715511322, + "step": 320 + }, + { + "epoch": 0.020125, + "grad_norm": 5.8125, + "grad_norm_var": 0.463134765625, + "learning_rate": 0.0001, + "loss": 10.4762, + "loss/crossentropy": 2.3753507137298584, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3840855211019516, + "step": 322 + }, + { + "epoch": 0.02025, + "grad_norm": 4.90625, + "grad_norm_var": 0.4641927083333333, + "learning_rate": 0.0001, + "loss": 10.2923, + "loss/crossentropy": 2.4433764219284058, + "loss/hidden": 4.125, + "loss/jsd": 0.0, + "loss/logits": 0.38050127029418945, + "step": 324 + }, + { + "epoch": 0.020375, + "grad_norm": 5.21875, + "grad_norm_var": 0.5068644205729167, + "learning_rate": 0.0001, + "loss": 10.6858, + "loss/crossentropy": 2.484220504760742, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.4011761546134949, + "step": 326 + }, + { + "epoch": 0.0205, + "grad_norm": 5.3125, + "grad_norm_var": 0.3748697916666667, + "learning_rate": 0.0001, + "loss": 10.7329, + "loss/crossentropy": 2.6139092445373535, + "loss/hidden": 4.203125, + "loss/jsd": 0.0, + "loss/logits": 0.4130321443080902, + "step": 328 + }, + { + "epoch": 0.020625, + "grad_norm": 5.0, + "grad_norm_var": 0.31311442057291666, + "learning_rate": 0.0001, + "loss": 10.606, + "loss/crossentropy": 2.4625054597854614, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.3967062383890152, + "step": 330 + }, + { + "epoch": 0.02075, + "grad_norm": 5.0625, + "grad_norm_var": 0.2874837239583333, + "learning_rate": 0.0001, + "loss": 10.5546, + "loss/crossentropy": 2.3454889059066772, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.3706536889076233, + "step": 332 + }, + { + "epoch": 0.020875, + "grad_norm": 5.4375, + "grad_norm_var": 0.18229166666666666, + "learning_rate": 0.0001, + "loss": 10.5487, + "loss/crossentropy": 2.4348955154418945, + "loss/hidden": 4.1875, + "loss/jsd": 0.0, + "loss/logits": 0.37259407341480255, + "step": 334 + }, + { + "epoch": 0.021, + "grad_norm": 4.25, + "grad_norm_var": 0.21061197916666666, + "learning_rate": 0.0001, + "loss": 10.3802, + "loss/crossentropy": 2.3722680807113647, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.37501636147499084, + "step": 336 + }, + { + "epoch": 0.021125, + "grad_norm": 5.6875, + "grad_norm_var": 0.17509358723958332, + "learning_rate": 0.0001, + "loss": 10.6118, + "loss/crossentropy": 2.367267608642578, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.37614843249320984, + "step": 338 + }, + { + "epoch": 0.02125, + "grad_norm": 4.96875, + "grad_norm_var": 0.18136393229166667, + "learning_rate": 0.0001, + "loss": 10.409, + "loss/crossentropy": 2.5342684984207153, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.4095509201288223, + "step": 340 + }, + { + "epoch": 0.021375, + "grad_norm": 4.25, + "grad_norm_var": 0.200634765625, + "learning_rate": 0.0001, + "loss": 10.347, + "loss/crossentropy": 2.366121530532837, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.38898931443691254, + "step": 342 + }, + { + "epoch": 0.0215, + "grad_norm": 5.96875, + "grad_norm_var": 0.391650390625, + "learning_rate": 0.0001, + "loss": 10.4284, + "loss/crossentropy": 2.435874819755554, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.4087076783180237, + "step": 344 + }, + { + "epoch": 0.021625, + "grad_norm": 5.46875, + "grad_norm_var": 0.39010416666666664, + "learning_rate": 0.0001, + "loss": 10.4112, + "loss/crossentropy": 2.4133975505828857, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3934263288974762, + "step": 346 + }, + { + "epoch": 0.02175, + "grad_norm": 4.8125, + "grad_norm_var": 0.40168863932291665, + "learning_rate": 0.0001, + "loss": 10.59, + "loss/crossentropy": 2.3464980125427246, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.36868566274642944, + "step": 348 + }, + { + "epoch": 0.021875, + "grad_norm": 5.15625, + "grad_norm_var": 0.3732421875, + "learning_rate": 0.0001, + "loss": 10.5281, + "loss/crossentropy": 2.492926597595215, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.37226299941539764, + "step": 350 + }, + { + "epoch": 0.022, + "grad_norm": 4.25, + "grad_norm_var": 0.37948811848958336, + "learning_rate": 0.0001, + "loss": 10.2246, + "loss/crossentropy": 2.3636070489883423, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3681965172290802, + "step": 352 + }, + { + "epoch": 0.022125, + "grad_norm": 4.625, + "grad_norm_var": 0.3692708333333333, + "learning_rate": 0.0001, + "loss": 10.5053, + "loss/crossentropy": 2.288292169570923, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.3958088457584381, + "step": 354 + }, + { + "epoch": 0.02225, + "grad_norm": 5.15625, + "grad_norm_var": 0.38084309895833335, + "learning_rate": 0.0001, + "loss": 10.3737, + "loss/crossentropy": 2.451295018196106, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.39023011922836304, + "step": 356 + }, + { + "epoch": 0.022375, + "grad_norm": 4.59375, + "grad_norm_var": 0.3459269205729167, + "learning_rate": 0.0001, + "loss": 10.3773, + "loss/crossentropy": 2.3242127895355225, + "loss/hidden": 4.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.39542824029922485, + "step": 358 + }, + { + "epoch": 0.0225, + "grad_norm": 5.03125, + "grad_norm_var": 0.132666015625, + "learning_rate": 0.0001, + "loss": 10.531, + "loss/crossentropy": 2.5108137130737305, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3721470236778259, + "step": 360 + }, + { + "epoch": 0.022625, + "grad_norm": 4.8125, + "grad_norm_var": 0.101416015625, + "learning_rate": 0.0001, + "loss": 10.4355, + "loss/crossentropy": 2.282769203186035, + "loss/hidden": 4.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.353266179561615, + "step": 362 + }, + { + "epoch": 0.02275, + "grad_norm": 5.40625, + "grad_norm_var": 0.13072916666666667, + "learning_rate": 0.0001, + "loss": 10.3682, + "loss/crossentropy": 2.3975025415420532, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.3848300874233246, + "step": 364 + }, + { + "epoch": 0.022875, + "grad_norm": 4.59375, + "grad_norm_var": 0.13717447916666667, + "learning_rate": 0.0001, + "loss": 10.3197, + "loss/crossentropy": 2.5082876682281494, + "loss/hidden": 4.203125, + "loss/jsd": 0.0, + "loss/logits": 0.38546115159988403, + "step": 366 + }, + { + "epoch": 0.023, + "grad_norm": 5.75, + "grad_norm_var": 0.23873697916666667, + "learning_rate": 0.0001, + "loss": 10.0812, + "loss/crossentropy": 2.3333388566970825, + "loss/hidden": 4.046875, + "loss/jsd": 0.0, + "loss/logits": 0.35471296310424805, + "step": 368 + }, + { + "epoch": 0.023125, + "grad_norm": 7.78125, + "grad_norm_var": 0.6998697916666666, + "learning_rate": 0.0001, + "loss": 10.6427, + "loss/crossentropy": 2.568707585334778, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3596802055835724, + "step": 370 + }, + { + "epoch": 0.02325, + "grad_norm": 5.3125, + "grad_norm_var": 0.73492431640625, + "learning_rate": 0.0001, + "loss": 10.3651, + "loss/crossentropy": 2.393819808959961, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3487332612276077, + "step": 372 + }, + { + "epoch": 0.023375, + "grad_norm": 5.03125, + "grad_norm_var": 0.7124348958333333, + "learning_rate": 0.0001, + "loss": 10.2767, + "loss/crossentropy": 2.472244381904602, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.40489913523197174, + "step": 374 + }, + { + "epoch": 0.0235, + "grad_norm": 4.78125, + "grad_norm_var": 0.7247029622395833, + "learning_rate": 0.0001, + "loss": 10.1461, + "loss/crossentropy": 2.2458752393722534, + "loss/hidden": 4.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3645085096359253, + "step": 376 + }, + { + "epoch": 0.023625, + "grad_norm": 4.40625, + "grad_norm_var": 0.77890625, + "learning_rate": 0.0001, + "loss": 10.2872, + "loss/crossentropy": 2.1981548070907593, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.38387705385684967, + "step": 378 + }, + { + "epoch": 0.02375, + "grad_norm": 4.84375, + "grad_norm_var": 0.7771443684895833, + "learning_rate": 0.0001, + "loss": 10.1281, + "loss/crossentropy": 2.4362692832946777, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3760421574115753, + "step": 380 + }, + { + "epoch": 0.023875, + "grad_norm": 4.65625, + "grad_norm_var": 0.7679972330729167, + "learning_rate": 0.0001, + "loss": 10.2042, + "loss/crossentropy": 2.4601200819015503, + "loss/hidden": 4.125, + "loss/jsd": 0.0, + "loss/logits": 0.3811968266963959, + "step": 382 + }, + { + "epoch": 0.024, + "grad_norm": 4.875, + "grad_norm_var": 0.684228515625, + "learning_rate": 0.0001, + "loss": 10.2408, + "loss/crossentropy": 2.6233471632003784, + "loss/hidden": 4.046875, + "loss/jsd": 0.0, + "loss/logits": 0.36839838325977325, + "step": 384 + }, + { + "epoch": 0.024125, + "grad_norm": 4.6875, + "grad_norm_var": 0.165478515625, + "learning_rate": 0.0001, + "loss": 10.1497, + "loss/crossentropy": 2.522361993789673, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3934475779533386, + "step": 386 + }, + { + "epoch": 0.02425, + "grad_norm": 4.5, + "grad_norm_var": 0.14563395182291666, + "learning_rate": 0.0001, + "loss": 10.1857, + "loss/crossentropy": 2.573809266090393, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.375105544924736, + "step": 388 + }, + { + "epoch": 0.024375, + "grad_norm": 4.40625, + "grad_norm_var": 0.13203125, + "learning_rate": 0.0001, + "loss": 10.0545, + "loss/crossentropy": 2.458760142326355, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.348113551735878, + "step": 390 + }, + { + "epoch": 0.0245, + "grad_norm": 4.8125, + "grad_norm_var": 0.13248291015625, + "learning_rate": 0.0001, + "loss": 10.1765, + "loss/crossentropy": 2.8183611631393433, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.39342811703681946, + "step": 392 + }, + { + "epoch": 0.024625, + "grad_norm": 4.65625, + "grad_norm_var": 0.11769205729166667, + "learning_rate": 0.0001, + "loss": 10.0009, + "loss/crossentropy": 2.2332464456558228, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.3468857705593109, + "step": 394 + }, + { + "epoch": 0.02475, + "grad_norm": 4.6875, + "grad_norm_var": 0.14143473307291668, + "learning_rate": 0.0001, + "loss": 9.9097, + "loss/crossentropy": 2.3098992109298706, + "loss/hidden": 4.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3603272885084152, + "step": 396 + }, + { + "epoch": 0.024875, + "grad_norm": 4.46875, + "grad_norm_var": 0.15129801432291667, + "learning_rate": 0.0001, + "loss": 10.1234, + "loss/crossentropy": 2.1571128964424133, + "loss/hidden": 4.1875, + "loss/jsd": 0.0, + "loss/logits": 0.3672170788049698, + "step": 398 + }, + { + "epoch": 0.025, + "grad_norm": 4.8125, + "grad_norm_var": 0.11464436848958333, + "learning_rate": 0.0001, + "loss": 10.1366, + "loss/crossentropy": 2.3940361738204956, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3588118702173233, + "step": 400 + }, + { + "epoch": 0.025125, + "grad_norm": 4.3125, + "grad_norm_var": 0.1046875, + "learning_rate": 0.0001, + "loss": 10.248, + "loss/crossentropy": 2.4594138860702515, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3485400527715683, + "step": 402 + }, + { + "epoch": 0.02525, + "grad_norm": 4.9375, + "grad_norm_var": 0.10377197265625, + "learning_rate": 0.0001, + "loss": 10.0681, + "loss/crossentropy": 2.524109125137329, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3557968735694885, + "step": 404 + }, + { + "epoch": 0.025375, + "grad_norm": 5.03125, + "grad_norm_var": 0.10859375, + "learning_rate": 0.0001, + "loss": 10.0777, + "loss/crossentropy": 2.3012895584106445, + "loss/hidden": 4.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.34439629316329956, + "step": 406 + }, + { + "epoch": 0.0255, + "grad_norm": 4.28125, + "grad_norm_var": 0.11404622395833333, + "learning_rate": 0.0001, + "loss": 9.8043, + "loss/crossentropy": 2.4211593866348267, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.3732890635728836, + "step": 408 + }, + { + "epoch": 0.025625, + "grad_norm": 4.40625, + "grad_norm_var": 0.11438395182291666, + "learning_rate": 0.0001, + "loss": 9.9315, + "loss/crossentropy": 2.582412362098694, + "loss/hidden": 3.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.36784547567367554, + "step": 410 + }, + { + "epoch": 0.02575, + "grad_norm": 4.0625, + "grad_norm_var": 0.07857666015625, + "learning_rate": 0.0001, + "loss": 9.9946, + "loss/crossentropy": 2.525751233100891, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.3565828502178192, + "step": 412 + }, + { + "epoch": 0.025875, + "grad_norm": 4.28125, + "grad_norm_var": 0.092578125, + "learning_rate": 0.0001, + "loss": 10.2235, + "loss/crossentropy": 2.670364737510681, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.4017476439476013, + "step": 414 + }, + { + "epoch": 0.026, + "grad_norm": 4.4375, + "grad_norm_var": 0.08396809895833333, + "learning_rate": 0.0001, + "loss": 9.8333, + "loss/crossentropy": 2.3298041820526123, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.3325035125017166, + "step": 416 + }, + { + "epoch": 0.026125, + "grad_norm": 4.34375, + "grad_norm_var": 0.08318684895833334, + "learning_rate": 0.0001, + "loss": 10.0788, + "loss/crossentropy": 2.240808844566345, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.33864179253578186, + "step": 418 + }, + { + "epoch": 0.02625, + "grad_norm": 4.4375, + "grad_norm_var": 0.06643473307291667, + "learning_rate": 0.0001, + "loss": 9.933, + "loss/crossentropy": 2.397716999053955, + "loss/hidden": 3.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.3628341108560562, + "step": 420 + }, + { + "epoch": 0.026375, + "grad_norm": 4.375, + "grad_norm_var": 0.06901041666666667, + "learning_rate": 0.0001, + "loss": 10.0998, + "loss/crossentropy": 2.4601176977157593, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.3522925227880478, + "step": 422 + }, + { + "epoch": 0.0265, + "grad_norm": 5.5, + "grad_norm_var": 0.15636393229166667, + "learning_rate": 0.0001, + "loss": 10.1351, + "loss/crossentropy": 2.4023250341415405, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.37351013720035553, + "step": 424 + }, + { + "epoch": 0.026625, + "grad_norm": 4.71875, + "grad_norm_var": 0.16990559895833332, + "learning_rate": 0.0001, + "loss": 10.0776, + "loss/crossentropy": 2.271855592727661, + "loss/hidden": 4.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3706711381673813, + "step": 426 + }, + { + "epoch": 0.02675, + "grad_norm": 3.953125, + "grad_norm_var": 0.17908426920572917, + "learning_rate": 0.0001, + "loss": 9.7587, + "loss/crossentropy": 2.0610432028770447, + "loss/hidden": 4.125, + "loss/jsd": 0.0, + "loss/logits": 0.3243980407714844, + "step": 428 + }, + { + "epoch": 0.026875, + "grad_norm": 5.21875, + "grad_norm_var": 0.20093485514322917, + "learning_rate": 0.0001, + "loss": 9.8788, + "loss/crossentropy": 2.4309468269348145, + "loss/hidden": 4.0, + "loss/jsd": 0.0, + "loss/logits": 0.4062986671924591, + "step": 430 + }, + { + "epoch": 0.027, + "grad_norm": 4.65625, + "grad_norm_var": 0.20027567545572916, + "learning_rate": 0.0001, + "loss": 10.0082, + "loss/crossentropy": 2.4598418474197388, + "loss/hidden": 3.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.4272041916847229, + "step": 432 + }, + { + "epoch": 0.027125, + "grad_norm": 5.03125, + "grad_norm_var": 0.21314188639322917, + "learning_rate": 0.0001, + "loss": 9.9003, + "loss/crossentropy": 2.539934992790222, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.35184885561466217, + "step": 434 + }, + { + "epoch": 0.02725, + "grad_norm": 4.625, + "grad_norm_var": 0.20462137858072918, + "learning_rate": 0.0001, + "loss": 9.9416, + "loss/crossentropy": 2.269451856613159, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.35347896814346313, + "step": 436 + }, + { + "epoch": 0.027375, + "grad_norm": 4.34375, + "grad_norm_var": 0.1935455322265625, + "learning_rate": 0.0001, + "loss": 9.9578, + "loss/crossentropy": 2.281239867210388, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3409070521593094, + "step": 438 + }, + { + "epoch": 0.0275, + "grad_norm": 4.1875, + "grad_norm_var": 0.1381988525390625, + "learning_rate": 0.0001, + "loss": 9.8183, + "loss/crossentropy": 2.2763094305992126, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.3811110109090805, + "step": 440 + }, + { + "epoch": 0.027625, + "grad_norm": 4.59375, + "grad_norm_var": 0.1252593994140625, + "learning_rate": 0.0001, + "loss": 9.833, + "loss/crossentropy": 2.5895315408706665, + "loss/hidden": 4.0, + "loss/jsd": 0.0, + "loss/logits": 0.36611178517341614, + "step": 442 + }, + { + "epoch": 0.02775, + "grad_norm": 4.21875, + "grad_norm_var": 0.10857747395833334, + "learning_rate": 0.0001, + "loss": 9.7918, + "loss/crossentropy": 2.1158281564712524, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3374405652284622, + "step": 444 + }, + { + "epoch": 0.027875, + "grad_norm": 5.96875, + "grad_norm_var": 0.20597330729166666, + "learning_rate": 0.0001, + "loss": 9.9942, + "loss/crossentropy": 2.446805953979492, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.37695541977882385, + "step": 446 + }, + { + "epoch": 0.028, + "grad_norm": 5.3125, + "grad_norm_var": 0.24869791666666666, + "learning_rate": 0.0001, + "loss": 10.0719, + "loss/crossentropy": 2.5359551906585693, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3757418543100357, + "step": 448 + }, + { + "epoch": 0.028125, + "grad_norm": 4.125, + "grad_norm_var": 0.24947916666666667, + "learning_rate": 0.0001, + "loss": 9.8412, + "loss/crossentropy": 2.656207323074341, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.42010098695755005, + "step": 450 + }, + { + "epoch": 0.02825, + "grad_norm": 5.125, + "grad_norm_var": 0.26568603515625, + "learning_rate": 0.0001, + "loss": 9.8241, + "loss/crossentropy": 2.4152743816375732, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.3626774847507477, + "step": 452 + }, + { + "epoch": 0.028375, + "grad_norm": 4.3125, + "grad_norm_var": 0.26858317057291664, + "learning_rate": 0.0001, + "loss": 10.136, + "loss/crossentropy": 2.4247626066207886, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.36247318983078003, + "step": 454 + }, + { + "epoch": 0.0285, + "grad_norm": 4.9375, + "grad_norm_var": 0.2647939046223958, + "learning_rate": 0.0001, + "loss": 9.8054, + "loss/crossentropy": 2.523893713951111, + "loss/hidden": 3.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.3691753149032593, + "step": 456 + }, + { + "epoch": 0.028625, + "grad_norm": 5.875, + "grad_norm_var": 0.6781646728515625, + "learning_rate": 0.0001, + "loss": 10.0115, + "loss/crossentropy": 2.2744319438934326, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.3499785512685776, + "step": 458 + }, + { + "epoch": 0.02875, + "grad_norm": 4.96875, + "grad_norm_var": 0.7750885009765625, + "learning_rate": 0.0001, + "loss": 9.8553, + "loss/crossentropy": 2.383001685142517, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.34379810094833374, + "step": 460 + }, + { + "epoch": 0.028875, + "grad_norm": 4.28125, + "grad_norm_var": 0.7739084879557292, + "learning_rate": 0.0001, + "loss": 9.9509, + "loss/crossentropy": 2.269408345222473, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.39304040372371674, + "step": 462 + }, + { + "epoch": 0.029, + "grad_norm": 4.625, + "grad_norm_var": 0.833544921875, + "learning_rate": 0.0001, + "loss": 9.8895, + "loss/crossentropy": 2.305214285850525, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.32623225450515747, + "step": 464 + }, + { + "epoch": 0.029125, + "grad_norm": 4.21875, + "grad_norm_var": 0.8327799479166667, + "learning_rate": 0.0001, + "loss": 9.9362, + "loss/crossentropy": 2.5358622074127197, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.35859355330467224, + "step": 466 + }, + { + "epoch": 0.02925, + "grad_norm": 4.3125, + "grad_norm_var": 0.8972981770833334, + "learning_rate": 0.0001, + "loss": 9.9957, + "loss/crossentropy": 2.490285634994507, + "loss/hidden": 4.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.35838285088539124, + "step": 468 + }, + { + "epoch": 0.029375, + "grad_norm": 4.84375, + "grad_norm_var": 0.8911417643229167, + "learning_rate": 0.0001, + "loss": 9.9963, + "loss/crossentropy": 2.388529062271118, + "loss/hidden": 3.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.3805827349424362, + "step": 470 + }, + { + "epoch": 0.0295, + "grad_norm": 4.09375, + "grad_norm_var": 0.8791575113932292, + "learning_rate": 0.0001, + "loss": 9.9093, + "loss/crossentropy": 2.464060425758362, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.35019560158252716, + "step": 472 + }, + { + "epoch": 0.029625, + "grad_norm": 5.46875, + "grad_norm_var": 0.37202046712239584, + "learning_rate": 0.0001, + "loss": 9.8783, + "loss/crossentropy": 2.3649709224700928, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.3797526955604553, + "step": 474 + }, + { + "epoch": 0.02975, + "grad_norm": 4.25, + "grad_norm_var": 0.29755757649739584, + "learning_rate": 0.0001, + "loss": 9.8323, + "loss/crossentropy": 2.4717437028884888, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.35063043236732483, + "step": 476 + }, + { + "epoch": 0.029875, + "grad_norm": 4.34375, + "grad_norm_var": 0.16384175618489583, + "learning_rate": 0.0001, + "loss": 9.9292, + "loss/crossentropy": 2.5398319959640503, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.37523798644542694, + "step": 478 + }, + { + "epoch": 0.03, + "grad_norm": 4.40625, + "grad_norm_var": 0.14685872395833333, + "learning_rate": 0.0001, + "loss": 9.7299, + "loss/crossentropy": 2.080340564250946, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.33192941546440125, + "step": 480 + }, + { + "epoch": 0.030125, + "grad_norm": 4.46875, + "grad_norm_var": 0.14511311848958333, + "learning_rate": 0.0001, + "loss": 9.6576, + "loss/crossentropy": 2.2822307348251343, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.336023285984993, + "step": 482 + }, + { + "epoch": 0.03025, + "grad_norm": 4.34375, + "grad_norm_var": 0.1125, + "learning_rate": 0.0001, + "loss": 9.5694, + "loss/crossentropy": 2.286174952983856, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.3336353003978729, + "step": 484 + }, + { + "epoch": 0.030375, + "grad_norm": 4.6875, + "grad_norm_var": 0.13880208333333333, + "learning_rate": 0.0001, + "loss": 9.7847, + "loss/crossentropy": 2.2369834184646606, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.32461969554424286, + "step": 486 + }, + { + "epoch": 0.0305, + "grad_norm": 3.828125, + "grad_norm_var": 0.16288960774739583, + "learning_rate": 0.0001, + "loss": 9.7289, + "loss/crossentropy": 2.3086917400360107, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.321616530418396, + "step": 488 + }, + { + "epoch": 0.030625, + "grad_norm": 4.21875, + "grad_norm_var": 0.09160054524739583, + "learning_rate": 0.0001, + "loss": 9.8277, + "loss/crossentropy": 2.3445401191711426, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.3501063734292984, + "step": 490 + }, + { + "epoch": 0.03075, + "grad_norm": 4.46875, + "grad_norm_var": 0.0995513916015625, + "learning_rate": 0.0001, + "loss": 9.611, + "loss/crossentropy": 1.9773722887039185, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.30824264883995056, + "step": 492 + }, + { + "epoch": 0.030875, + "grad_norm": 4.25, + "grad_norm_var": 0.09944559733072916, + "learning_rate": 0.0001, + "loss": 9.5735, + "loss/crossentropy": 2.428261160850525, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.35596518218517303, + "step": 494 + }, + { + "epoch": 0.031, + "grad_norm": 4.125, + "grad_norm_var": 0.09492085774739584, + "learning_rate": 0.0001, + "loss": 9.7677, + "loss/crossentropy": 2.262718915939331, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.33079805970191956, + "step": 496 + }, + { + "epoch": 0.031125, + "grad_norm": 4.5, + "grad_norm_var": 0.10596415201822916, + "learning_rate": 0.0001, + "loss": 9.7701, + "loss/crossentropy": 2.3702725172042847, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.3400324583053589, + "step": 498 + }, + { + "epoch": 0.03125, + "grad_norm": 3.84375, + "grad_norm_var": 0.13961588541666667, + "learning_rate": 0.0001, + "loss": 9.5602, + "loss/crossentropy": 2.295218586921692, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.3482416272163391, + "step": 500 + }, + { + "epoch": 0.031375, + "grad_norm": 5.34375, + "grad_norm_var": 0.15602213541666668, + "learning_rate": 0.0001, + "loss": 10.0544, + "loss/crossentropy": 2.445479154586792, + "loss/hidden": 3.828125, + "loss/jsd": 0.0, + "loss/logits": 0.37152746319770813, + "step": 502 + }, + { + "epoch": 0.0315, + "grad_norm": 4.9375, + "grad_norm_var": 0.1950836181640625, + "learning_rate": 0.0001, + "loss": 9.5511, + "loss/crossentropy": 2.223459005355835, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.32452794909477234, + "step": 504 + }, + { + "epoch": 0.031625, + "grad_norm": 4.5625, + "grad_norm_var": 0.19709370930989584, + "learning_rate": 0.0001, + "loss": 9.8003, + "loss/crossentropy": 2.6400363445281982, + "loss/hidden": 3.921875, + "loss/jsd": 0.0, + "loss/logits": 0.3358649015426636, + "step": 506 + }, + { + "epoch": 0.03175, + "grad_norm": 4.1875, + "grad_norm_var": 0.20300191243489582, + "learning_rate": 0.0001, + "loss": 9.7514, + "loss/crossentropy": 2.548031210899353, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.3170333355665207, + "step": 508 + }, + { + "epoch": 0.031875, + "grad_norm": 4.8125, + "grad_norm_var": 0.21568094889322917, + "learning_rate": 0.0001, + "loss": 9.8207, + "loss/crossentropy": 2.2956899404525757, + "loss/hidden": 3.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.3392469882965088, + "step": 510 + }, + { + "epoch": 0.032, + "grad_norm": 4.34375, + "grad_norm_var": 0.22424723307291666, + "learning_rate": 0.0001, + "loss": 9.6765, + "loss/crossentropy": 2.353795349597931, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.33590464293956757, + "step": 512 + }, + { + "epoch": 0.032125, + "grad_norm": 3.796875, + "grad_norm_var": 0.23311258951822916, + "learning_rate": 0.0001, + "loss": 9.5705, + "loss/crossentropy": 2.3682990074157715, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.3214150220155716, + "step": 514 + }, + { + "epoch": 0.03225, + "grad_norm": 4.375, + "grad_norm_var": 0.19153238932291666, + "learning_rate": 0.0001, + "loss": 9.5203, + "loss/crossentropy": 2.2625592947006226, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.35527725517749786, + "step": 516 + }, + { + "epoch": 0.032375, + "grad_norm": 4.78125, + "grad_norm_var": 0.16765950520833334, + "learning_rate": 0.0001, + "loss": 9.7962, + "loss/crossentropy": 2.4448131322860718, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.33070215582847595, + "step": 518 + }, + { + "epoch": 0.0325, + "grad_norm": 3.875, + "grad_norm_var": 0.143310546875, + "learning_rate": 0.0001, + "loss": 9.6837, + "loss/crossentropy": 2.3028098344802856, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.34870584309101105, + "step": 520 + }, + { + "epoch": 0.032625, + "grad_norm": 4.09375, + "grad_norm_var": 0.16155497233072916, + "learning_rate": 0.0001, + "loss": 9.5805, + "loss/crossentropy": 2.181080639362335, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.332154244184494, + "step": 522 + }, + { + "epoch": 0.03275, + "grad_norm": 4.09375, + "grad_norm_var": 0.1529937744140625, + "learning_rate": 0.0001, + "loss": 9.5179, + "loss/crossentropy": 2.337011694908142, + "loss/hidden": 3.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.3419201970100403, + "step": 524 + }, + { + "epoch": 0.032875, + "grad_norm": 4.5, + "grad_norm_var": 0.3694976806640625, + "learning_rate": 0.0001, + "loss": 9.6365, + "loss/crossentropy": 2.354939341545105, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.35222889482975006, + "step": 526 + }, + { + "epoch": 0.033, + "grad_norm": 3.8125, + "grad_norm_var": 0.41142476399739586, + "learning_rate": 0.0001, + "loss": 9.4276, + "loss/crossentropy": 2.2241241931915283, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.32786163687705994, + "step": 528 + }, + { + "epoch": 0.033125, + "grad_norm": 4.40625, + "grad_norm_var": 0.3993886311848958, + "learning_rate": 0.0001, + "loss": 9.5245, + "loss/crossentropy": 2.4617063999176025, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.36403751373291016, + "step": 530 + }, + { + "epoch": 0.03325, + "grad_norm": 4.15625, + "grad_norm_var": 0.4066396077473958, + "learning_rate": 0.0001, + "loss": 9.5201, + "loss/crossentropy": 2.2278032302856445, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.3158974349498749, + "step": 532 + }, + { + "epoch": 0.033375, + "grad_norm": 4.6875, + "grad_norm_var": 0.3785634358723958, + "learning_rate": 0.0001, + "loss": 9.6706, + "loss/crossentropy": 2.464658737182617, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.3708791136741638, + "step": 534 + }, + { + "epoch": 0.0335, + "grad_norm": 4.25, + "grad_norm_var": 0.34780985514322915, + "learning_rate": 0.0001, + "loss": 9.4853, + "loss/crossentropy": 2.659584403038025, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.3422502875328064, + "step": 536 + }, + { + "epoch": 0.033625, + "grad_norm": 4.03125, + "grad_norm_var": 0.3409830729166667, + "learning_rate": 0.0001, + "loss": 9.5004, + "loss/crossentropy": 2.3510810136795044, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.3363768756389618, + "step": 538 + }, + { + "epoch": 0.03375, + "grad_norm": 4.0625, + "grad_norm_var": 0.3513336181640625, + "learning_rate": 0.0001, + "loss": 9.6363, + "loss/crossentropy": 2.4384061098098755, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.3327721059322357, + "step": 540 + }, + { + "epoch": 0.033875, + "grad_norm": 4.46875, + "grad_norm_var": 0.08550516764322917, + "learning_rate": 0.0001, + "loss": 9.628, + "loss/crossentropy": 2.435794949531555, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.3326384872198105, + "step": 542 + }, + { + "epoch": 0.034, + "grad_norm": 4.03125, + "grad_norm_var": 0.06864827473958333, + "learning_rate": 0.0001, + "loss": 9.6422, + "loss/crossentropy": 2.430909752845764, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.33669474720954895, + "step": 544 + }, + { + "epoch": 0.034125, + "grad_norm": 4.3125, + "grad_norm_var": 0.06433817545572916, + "learning_rate": 0.0001, + "loss": 9.4545, + "loss/crossentropy": 2.4339792728424072, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.33400970697402954, + "step": 546 + }, + { + "epoch": 0.03425, + "grad_norm": 3.78125, + "grad_norm_var": 0.060334269205729166, + "learning_rate": 0.0001, + "loss": 9.6066, + "loss/crossentropy": 2.601755380630493, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.3572891056537628, + "step": 548 + }, + { + "epoch": 0.034375, + "grad_norm": 4.75, + "grad_norm_var": 0.06419169108072917, + "learning_rate": 0.0001, + "loss": 9.5364, + "loss/crossentropy": 2.263180732727051, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.33265452086925507, + "step": 550 + }, + { + "epoch": 0.0345, + "grad_norm": 4.21875, + "grad_norm_var": 0.05921122233072917, + "learning_rate": 0.0001, + "loss": 9.4317, + "loss/crossentropy": 2.6668169498443604, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.3484792411327362, + "step": 552 + }, + { + "epoch": 0.034625, + "grad_norm": 4.0, + "grad_norm_var": 0.05729878743489583, + "learning_rate": 0.0001, + "loss": 9.5416, + "loss/crossentropy": 2.488753318786621, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3451061546802521, + "step": 554 + }, + { + "epoch": 0.03475, + "grad_norm": 4.0625, + "grad_norm_var": 0.06444905598958334, + "learning_rate": 0.0001, + "loss": 9.3819, + "loss/crossentropy": 2.5572561025619507, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.3529306650161743, + "step": 556 + }, + { + "epoch": 0.034875, + "grad_norm": 4.46875, + "grad_norm_var": 0.06443684895833333, + "learning_rate": 0.0001, + "loss": 9.6456, + "loss/crossentropy": 2.3274362087249756, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.3618105351924896, + "step": 558 + }, + { + "epoch": 0.035, + "grad_norm": 4.34375, + "grad_norm_var": 0.059370930989583334, + "learning_rate": 0.0001, + "loss": 9.4529, + "loss/crossentropy": 2.5755836963653564, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.3374823033809662, + "step": 560 + }, + { + "epoch": 0.035125, + "grad_norm": 3.640625, + "grad_norm_var": 0.0757232666015625, + "learning_rate": 0.0001, + "loss": 9.5006, + "loss/crossentropy": 2.291811466217041, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.33205731213092804, + "step": 562 + }, + { + "epoch": 0.03525, + "grad_norm": 4.125, + "grad_norm_var": 0.07857666015625, + "learning_rate": 0.0001, + "loss": 9.4801, + "loss/crossentropy": 2.378532886505127, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.31333786249160767, + "step": 564 + }, + { + "epoch": 0.035375, + "grad_norm": 4.46875, + "grad_norm_var": 0.07685546875, + "learning_rate": 0.0001, + "loss": 9.425, + "loss/crossentropy": 2.368729591369629, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.29723505675792694, + "step": 566 + }, + { + "epoch": 0.0355, + "grad_norm": 3.890625, + "grad_norm_var": 0.0939605712890625, + "learning_rate": 0.0001, + "loss": 9.3256, + "loss/crossentropy": 2.359733462333679, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.3121738135814667, + "step": 568 + }, + { + "epoch": 0.035625, + "grad_norm": 4.9375, + "grad_norm_var": 0.1553131103515625, + "learning_rate": 0.0001, + "loss": 9.4416, + "loss/crossentropy": 2.315782904624939, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3432431221008301, + "step": 570 + }, + { + "epoch": 0.03575, + "grad_norm": 3.3125, + "grad_norm_var": 0.19507548014322917, + "learning_rate": 0.0001, + "loss": 9.5614, + "loss/crossentropy": 2.3565382957458496, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.32801851630210876, + "step": 572 + }, + { + "epoch": 0.035875, + "grad_norm": 4.25, + "grad_norm_var": 0.19041239420572917, + "learning_rate": 0.0001, + "loss": 9.3502, + "loss/crossentropy": 2.354498505592346, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.34335146844387054, + "step": 574 + }, + { + "epoch": 0.036, + "grad_norm": 4.0625, + "grad_norm_var": 0.1861968994140625, + "learning_rate": 0.0001, + "loss": 9.4591, + "loss/crossentropy": 2.2208141088485718, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.32215404510498047, + "step": 576 + }, + { + "epoch": 0.036125, + "grad_norm": 3.953125, + "grad_norm_var": 0.17675374348958334, + "learning_rate": 0.0001, + "loss": 9.5684, + "loss/crossentropy": 2.1560009717941284, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3317830264568329, + "step": 578 + }, + { + "epoch": 0.03625, + "grad_norm": 3.84375, + "grad_norm_var": 0.1708648681640625, + "learning_rate": 0.0001, + "loss": 9.384, + "loss/crossentropy": 2.254258155822754, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.34166762232780457, + "step": 580 + }, + { + "epoch": 0.036375, + "grad_norm": 3.90625, + "grad_norm_var": 0.14008687337239584, + "learning_rate": 0.0001, + "loss": 9.4391, + "loss/crossentropy": 2.193941831588745, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.3121718168258667, + "step": 582 + }, + { + "epoch": 0.0365, + "grad_norm": 4.1875, + "grad_norm_var": 0.13092041015625, + "learning_rate": 0.0001, + "loss": 9.5948, + "loss/crossentropy": 2.610209345817566, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.35641224682331085, + "step": 584 + }, + { + "epoch": 0.036625, + "grad_norm": 3.9375, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 0.0001, + "loss": 9.334, + "loss/crossentropy": 2.173751473426819, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.3093992620706558, + "step": 586 + }, + { + "epoch": 0.03675, + "grad_norm": 3.734375, + "grad_norm_var": 0.021141560872395833, + "learning_rate": 0.0001, + "loss": 9.3647, + "loss/crossentropy": 2.6784013509750366, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.3435199409723282, + "step": 588 + }, + { + "epoch": 0.036875, + "grad_norm": 4.375, + "grad_norm_var": 0.028416951497395832, + "learning_rate": 0.0001, + "loss": 9.6858, + "loss/crossentropy": 2.5606144666671753, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.3252936899662018, + "step": 590 + }, + { + "epoch": 0.037, + "grad_norm": 4.28125, + "grad_norm_var": 0.07421773274739583, + "learning_rate": 0.0001, + "loss": 9.1905, + "loss/crossentropy": 2.5036474466323853, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.3341420292854309, + "step": 592 + }, + { + "epoch": 0.037125, + "grad_norm": 4.0, + "grad_norm_var": 0.07280171712239583, + "learning_rate": 0.0001, + "loss": 9.2089, + "loss/crossentropy": 2.138327479362488, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3032035082578659, + "step": 594 + }, + { + "epoch": 0.03725, + "grad_norm": 3.296875, + "grad_norm_var": 0.11728515625, + "learning_rate": 0.0001, + "loss": 9.1876, + "loss/crossentropy": 2.0817145109176636, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.28961437940597534, + "step": 596 + }, + { + "epoch": 0.037375, + "grad_norm": 4.40625, + "grad_norm_var": 0.127685546875, + "learning_rate": 0.0001, + "loss": 9.5, + "loss/crossentropy": 2.231162667274475, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.32542233169078827, + "step": 598 + }, + { + "epoch": 0.0375, + "grad_norm": 4.09375, + "grad_norm_var": 0.12625325520833333, + "learning_rate": 0.0001, + "loss": 9.4883, + "loss/crossentropy": 2.279044270515442, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.30954092741012573, + "step": 600 + }, + { + "epoch": 0.037625, + "grad_norm": 3.96875, + "grad_norm_var": 0.12911783854166667, + "learning_rate": 0.0001, + "loss": 9.5667, + "loss/crossentropy": 2.124338150024414, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.3287663906812668, + "step": 602 + }, + { + "epoch": 0.03775, + "grad_norm": 4.4375, + "grad_norm_var": 0.1248443603515625, + "learning_rate": 0.0001, + "loss": 9.5844, + "loss/crossentropy": 2.5788776874542236, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.34556926786899567, + "step": 604 + }, + { + "epoch": 0.037875, + "grad_norm": 3.390625, + "grad_norm_var": 0.15191650390625, + "learning_rate": 0.0001, + "loss": 9.4029, + "loss/crossentropy": 2.511660575866699, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.31718502938747406, + "step": 606 + }, + { + "epoch": 0.038, + "grad_norm": 4.6875, + "grad_norm_var": 0.13528238932291667, + "learning_rate": 0.0001, + "loss": 9.4312, + "loss/crossentropy": 2.558152675628662, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3450692296028137, + "step": 608 + }, + { + "epoch": 0.038125, + "grad_norm": 3.921875, + "grad_norm_var": 0.13547261555989584, + "learning_rate": 0.0001, + "loss": 9.2833, + "loss/crossentropy": 2.2010965943336487, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.33724747598171234, + "step": 610 + }, + { + "epoch": 0.03825, + "grad_norm": 3.90625, + "grad_norm_var": 0.098974609375, + "learning_rate": 0.0001, + "loss": 9.4903, + "loss/crossentropy": 2.499935030937195, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.3286616951227188, + "step": 612 + }, + { + "epoch": 0.038375, + "grad_norm": 4.0625, + "grad_norm_var": 0.08684488932291666, + "learning_rate": 0.0001, + "loss": 9.3714, + "loss/crossentropy": 2.29742568731308, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.3321594297885895, + "step": 614 + }, + { + "epoch": 0.0385, + "grad_norm": 4.4375, + "grad_norm_var": 0.10472005208333333, + "learning_rate": 0.0001, + "loss": 9.2124, + "loss/crossentropy": 2.2705591917037964, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.33022937178611755, + "step": 616 + }, + { + "epoch": 0.038625, + "grad_norm": 3.78125, + "grad_norm_var": 0.105615234375, + "learning_rate": 0.0001, + "loss": 9.3996, + "loss/crossentropy": 2.5177834033966064, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.33882059156894684, + "step": 618 + }, + { + "epoch": 0.03875, + "grad_norm": 3.9375, + "grad_norm_var": 0.09058329264322916, + "learning_rate": 0.0001, + "loss": 9.3251, + "loss/crossentropy": 2.3914138078689575, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.300808310508728, + "step": 620 + }, + { + "epoch": 0.038875, + "grad_norm": 4.84375, + "grad_norm_var": 0.107177734375, + "learning_rate": 0.0001, + "loss": 9.4366, + "loss/crossentropy": 2.4114054441452026, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.3568413257598877, + "step": 622 + }, + { + "epoch": 0.039, + "grad_norm": 4.03125, + "grad_norm_var": 0.08601786295572916, + "learning_rate": 0.0001, + "loss": 9.1418, + "loss/crossentropy": 2.3419090509414673, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.31717973947525024, + "step": 624 + }, + { + "epoch": 0.039125, + "grad_norm": 4.25, + "grad_norm_var": 0.09462890625, + "learning_rate": 0.0001, + "loss": 9.4373, + "loss/crossentropy": 2.64203143119812, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.33858008682727814, + "step": 626 + }, + { + "epoch": 0.03925, + "grad_norm": 3.625, + "grad_norm_var": 0.10220947265625, + "learning_rate": 0.0001, + "loss": 9.3278, + "loss/crossentropy": 2.0542389154434204, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.30543386936187744, + "step": 628 + }, + { + "epoch": 0.039375, + "grad_norm": 3.875, + "grad_norm_var": 0.13300679524739584, + "learning_rate": 0.0001, + "loss": 9.2397, + "loss/crossentropy": 2.4575854539871216, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.3102063983678818, + "step": 630 + }, + { + "epoch": 0.0395, + "grad_norm": 4.28125, + "grad_norm_var": 0.1220611572265625, + "learning_rate": 0.0001, + "loss": 9.3452, + "loss/crossentropy": 2.3602949380874634, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.323737695813179, + "step": 632 + }, + { + "epoch": 0.039625, + "grad_norm": 3.71875, + "grad_norm_var": 0.12845052083333333, + "learning_rate": 0.0001, + "loss": 9.2681, + "loss/crossentropy": 2.507497191429138, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3275406062602997, + "step": 634 + }, + { + "epoch": 0.03975, + "grad_norm": 4.25, + "grad_norm_var": 0.13587137858072917, + "learning_rate": 0.0001, + "loss": 9.36, + "loss/crossentropy": 2.2765142917633057, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.31396952271461487, + "step": 636 + }, + { + "epoch": 0.039875, + "grad_norm": 3.46875, + "grad_norm_var": 0.10793863932291667, + "learning_rate": 0.0001, + "loss": 9.2294, + "loss/crossentropy": 2.341191053390503, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.32686179876327515, + "step": 638 + }, + { + "epoch": 0.04, + "grad_norm": 4.8125, + "grad_norm_var": 0.1619781494140625, + "learning_rate": 0.0001, + "loss": 9.2556, + "loss/crossentropy": 2.252098858356476, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3205975890159607, + "step": 640 + }, + { + "epoch": 0.040125, + "grad_norm": 3.4375, + "grad_norm_var": 0.20991109212239584, + "learning_rate": 0.0001, + "loss": 9.3137, + "loss/crossentropy": 2.2994823455810547, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.34002968668937683, + "step": 642 + }, + { + "epoch": 0.04025, + "grad_norm": 4.375, + "grad_norm_var": 0.21840718587239583, + "learning_rate": 0.0001, + "loss": 9.1512, + "loss/crossentropy": 2.5480741262435913, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.33601297438144684, + "step": 644 + }, + { + "epoch": 0.040375, + "grad_norm": 3.578125, + "grad_norm_var": 0.23454488118489583, + "learning_rate": 0.0001, + "loss": 9.3808, + "loss/crossentropy": 2.524424910545349, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.3395262509584427, + "step": 646 + }, + { + "epoch": 0.0405, + "grad_norm": 3.75, + "grad_norm_var": 0.23319905598958332, + "learning_rate": 0.0001, + "loss": 9.1816, + "loss/crossentropy": 2.2198326587677, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.33485807478427887, + "step": 648 + }, + { + "epoch": 0.040625, + "grad_norm": 4.125, + "grad_norm_var": 0.23205973307291666, + "learning_rate": 0.0001, + "loss": 9.252, + "loss/crossentropy": 2.5186063051223755, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.33504799008369446, + "step": 650 + }, + { + "epoch": 0.04075, + "grad_norm": 3.5, + "grad_norm_var": 0.23567606608072916, + "learning_rate": 0.0001, + "loss": 9.299, + "loss/crossentropy": 2.3492661714553833, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.31913943588733673, + "step": 652 + }, + { + "epoch": 0.040875, + "grad_norm": 4.1875, + "grad_norm_var": 0.23007405598958333, + "learning_rate": 0.0001, + "loss": 9.2997, + "loss/crossentropy": 2.600319981575012, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.32478684186935425, + "step": 654 + }, + { + "epoch": 0.041, + "grad_norm": 3.859375, + "grad_norm_var": 0.17136942545572917, + "learning_rate": 0.0001, + "loss": 9.1048, + "loss/crossentropy": 2.335653781890869, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3278462737798691, + "step": 656 + }, + { + "epoch": 0.041125, + "grad_norm": 3.640625, + "grad_norm_var": 0.12332356770833333, + "learning_rate": 0.0001, + "loss": 9.2115, + "loss/crossentropy": 2.223168969154358, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.3341253995895386, + "step": 658 + }, + { + "epoch": 0.04125, + "grad_norm": 5.3125, + "grad_norm_var": 0.252490234375, + "learning_rate": 0.0001, + "loss": 9.1622, + "loss/crossentropy": 2.424691677093506, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3088841736316681, + "step": 660 + }, + { + "epoch": 0.041375, + "grad_norm": 4.625, + "grad_norm_var": 0.2993072509765625, + "learning_rate": 0.0001, + "loss": 9.1041, + "loss/crossentropy": 2.07004451751709, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.2844501733779907, + "step": 662 + }, + { + "epoch": 0.0415, + "grad_norm": 5.03125, + "grad_norm_var": 0.3312459309895833, + "learning_rate": 0.0001, + "loss": 9.3562, + "loss/crossentropy": 2.32711398601532, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3303475081920624, + "step": 664 + }, + { + "epoch": 0.041625, + "grad_norm": 4.3125, + "grad_norm_var": 0.48313700358072914, + "learning_rate": 0.0001, + "loss": 9.2399, + "loss/crossentropy": 2.3355051279067993, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.34253838658332825, + "step": 666 + }, + { + "epoch": 0.04175, + "grad_norm": 4.0, + "grad_norm_var": 0.504443359375, + "learning_rate": 0.0001, + "loss": 9.3712, + "loss/crossentropy": 2.423375368118286, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31640373170375824, + "step": 668 + }, + { + "epoch": 0.041875, + "grad_norm": 3.8125, + "grad_norm_var": 0.49081624348958336, + "learning_rate": 0.0001, + "loss": 9.1568, + "loss/crossentropy": 2.4355897903442383, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.3098563849925995, + "step": 670 + }, + { + "epoch": 0.042, + "grad_norm": 3.65625, + "grad_norm_var": 0.5179433186848958, + "learning_rate": 0.0001, + "loss": 9.0687, + "loss/crossentropy": 2.151113748550415, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.28571945428848267, + "step": 672 + }, + { + "epoch": 0.042125, + "grad_norm": 4.09375, + "grad_norm_var": 0.45859375, + "learning_rate": 0.0001, + "loss": 9.126, + "loss/crossentropy": 2.1033096313476562, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.30395573377609253, + "step": 674 + }, + { + "epoch": 0.04225, + "grad_norm": 3.703125, + "grad_norm_var": 0.41311848958333336, + "learning_rate": 0.0001, + "loss": 9.2339, + "loss/crossentropy": 2.384363532066345, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3477473706007004, + "step": 676 + }, + { + "epoch": 0.042375, + "grad_norm": 4.0, + "grad_norm_var": 0.35420633951822916, + "learning_rate": 0.0001, + "loss": 9.2017, + "loss/crossentropy": 2.3887627124786377, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3319309651851654, + "step": 678 + }, + { + "epoch": 0.0425, + "grad_norm": 3.796875, + "grad_norm_var": 0.28951416015625, + "learning_rate": 0.0001, + "loss": 9.0506, + "loss/crossentropy": 2.3131089210510254, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.31970134377479553, + "step": 680 + }, + { + "epoch": 0.042625, + "grad_norm": 3.515625, + "grad_norm_var": 0.0540191650390625, + "learning_rate": 0.0001, + "loss": 9.3033, + "loss/crossentropy": 2.2213594913482666, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2932916283607483, + "step": 682 + }, + { + "epoch": 0.04275, + "grad_norm": 3.9375, + "grad_norm_var": 0.04342041015625, + "learning_rate": 0.0001, + "loss": 9.1272, + "loss/crossentropy": 2.343689441680908, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.31054770946502686, + "step": 684 + }, + { + "epoch": 0.042875, + "grad_norm": 4.125, + "grad_norm_var": 0.07541910807291667, + "learning_rate": 0.0001, + "loss": 9.0333, + "loss/crossentropy": 2.426623225212097, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.30797363817691803, + "step": 686 + }, + { + "epoch": 0.043, + "grad_norm": 3.75, + "grad_norm_var": 0.07333577473958333, + "learning_rate": 0.0001, + "loss": 9.1961, + "loss/crossentropy": 2.1243041157722473, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.27504249662160873, + "step": 688 + }, + { + "epoch": 0.043125, + "grad_norm": 3.546875, + "grad_norm_var": 0.07789306640625, + "learning_rate": 0.0001, + "loss": 9.3228, + "loss/crossentropy": 2.2858647108078003, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.3059113025665283, + "step": 690 + }, + { + "epoch": 0.04325, + "grad_norm": 4.03125, + "grad_norm_var": 0.07827046712239584, + "learning_rate": 0.0001, + "loss": 9.2223, + "loss/crossentropy": 2.6258697509765625, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.32897868752479553, + "step": 692 + }, + { + "epoch": 0.043375, + "grad_norm": 4.15625, + "grad_norm_var": 0.0829986572265625, + "learning_rate": 0.0001, + "loss": 9.0107, + "loss/crossentropy": 2.3375871181488037, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.30802060663700104, + "step": 694 + }, + { + "epoch": 0.0435, + "grad_norm": 3.96875, + "grad_norm_var": 0.08065999348958333, + "learning_rate": 0.0001, + "loss": 9.1397, + "loss/crossentropy": 2.27328884601593, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.32508768141269684, + "step": 696 + }, + { + "epoch": 0.043625, + "grad_norm": 3.90625, + "grad_norm_var": 0.07095947265625, + "learning_rate": 0.0001, + "loss": 9.0871, + "loss/crossentropy": 2.3383896350860596, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3195807486772537, + "step": 698 + }, + { + "epoch": 0.04375, + "grad_norm": 3.609375, + "grad_norm_var": 0.07858784993489583, + "learning_rate": 0.0001, + "loss": 9.0647, + "loss/crossentropy": 1.985029935836792, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.2951173782348633, + "step": 700 + }, + { + "epoch": 0.043875, + "grad_norm": 3.5625, + "grad_norm_var": 0.04241129557291667, + "learning_rate": 0.0001, + "loss": 9.2963, + "loss/crossentropy": 2.236124038696289, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3033126890659332, + "step": 702 + }, + { + "epoch": 0.044, + "grad_norm": 3.9375, + "grad_norm_var": 0.03899332682291667, + "learning_rate": 0.0001, + "loss": 9.4002, + "loss/crossentropy": 2.3327068090438843, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.35688331723213196, + "step": 704 + }, + { + "epoch": 0.044125, + "grad_norm": 3.40625, + "grad_norm_var": 0.04415690104166667, + "learning_rate": 0.0001, + "loss": 9.1288, + "loss/crossentropy": 2.185292422771454, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.2997971922159195, + "step": 706 + }, + { + "epoch": 0.04425, + "grad_norm": 4.03125, + "grad_norm_var": 0.048193359375, + "learning_rate": 0.0001, + "loss": 9.1816, + "loss/crossentropy": 2.592350959777832, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3380052447319031, + "step": 708 + }, + { + "epoch": 0.044375, + "grad_norm": 3.625, + "grad_norm_var": 0.050837198893229164, + "learning_rate": 0.0001, + "loss": 9.2751, + "loss/crossentropy": 2.3508870601654053, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.30894704163074493, + "step": 710 + }, + { + "epoch": 0.0445, + "grad_norm": 3.703125, + "grad_norm_var": 0.05436909993489583, + "learning_rate": 0.0001, + "loss": 9.2454, + "loss/crossentropy": 2.4968451261520386, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.3285796344280243, + "step": 712 + }, + { + "epoch": 0.044625, + "grad_norm": 3.90625, + "grad_norm_var": 0.10283203125, + "learning_rate": 0.0001, + "loss": 9.3351, + "loss/crossentropy": 2.4106976985931396, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.32071977853775024, + "step": 714 + }, + { + "epoch": 0.04475, + "grad_norm": 3.609375, + "grad_norm_var": 0.09908447265625, + "learning_rate": 0.0001, + "loss": 9.0656, + "loss/crossentropy": 2.2552783489227295, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.30507735908031464, + "step": 716 + }, + { + "epoch": 0.044875, + "grad_norm": 3.96875, + "grad_norm_var": 0.09814046223958334, + "learning_rate": 0.0001, + "loss": 9.0835, + "loss/crossentropy": 2.4068862199783325, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31281837821006775, + "step": 718 + }, + { + "epoch": 0.045, + "grad_norm": 3.75, + "grad_norm_var": 0.17351888020833334, + "learning_rate": 0.0001, + "loss": 9.2535, + "loss/crossentropy": 2.2698957920074463, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3157753646373749, + "step": 720 + }, + { + "epoch": 0.045125, + "grad_norm": 4.25, + "grad_norm_var": 0.25741780598958336, + "learning_rate": 0.0001, + "loss": 9.2178, + "loss/crossentropy": 2.5466257333755493, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3196914494037628, + "step": 722 + }, + { + "epoch": 0.04525, + "grad_norm": 3.75, + "grad_norm_var": 0.2650553385416667, + "learning_rate": 0.0001, + "loss": 9.0596, + "loss/crossentropy": 2.259085774421692, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.30355823040008545, + "step": 724 + }, + { + "epoch": 0.045375, + "grad_norm": 3.828125, + "grad_norm_var": 0.2575836181640625, + "learning_rate": 0.0001, + "loss": 9.3315, + "loss/crossentropy": 2.420317769050598, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3066561073064804, + "step": 726 + }, + { + "epoch": 0.0455, + "grad_norm": 3.65625, + "grad_norm_var": 0.24367574055989583, + "learning_rate": 0.0001, + "loss": 9.074, + "loss/crossentropy": 2.244703531265259, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.30555886030197144, + "step": 728 + }, + { + "epoch": 0.045625, + "grad_norm": 3.875, + "grad_norm_var": 0.21819254557291667, + "learning_rate": 0.0001, + "loss": 9.1535, + "loss/crossentropy": 2.3010120391845703, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.30051296949386597, + "step": 730 + }, + { + "epoch": 0.04575, + "grad_norm": 3.8125, + "grad_norm_var": 0.2000885009765625, + "learning_rate": 0.0001, + "loss": 9.2806, + "loss/crossentropy": 2.0744789838790894, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3020750731229782, + "step": 732 + }, + { + "epoch": 0.045875, + "grad_norm": 4.875, + "grad_norm_var": 0.2525390625, + "learning_rate": 0.0001, + "loss": 9.1755, + "loss/crossentropy": 2.4247848987579346, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.30639201402664185, + "step": 734 + }, + { + "epoch": 0.046, + "grad_norm": 3.6875, + "grad_norm_var": 0.21112874348958333, + "learning_rate": 0.0001, + "loss": 9.2127, + "loss/crossentropy": 2.228127598762512, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.2969563454389572, + "step": 736 + }, + { + "epoch": 0.046125, + "grad_norm": 3.453125, + "grad_norm_var": 0.12919514973958332, + "learning_rate": 0.0001, + "loss": 8.998, + "loss/crossentropy": 2.3098256587982178, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.30860866606235504, + "step": 738 + }, + { + "epoch": 0.04625, + "grad_norm": 3.46875, + "grad_norm_var": 0.14089253743489583, + "learning_rate": 0.0001, + "loss": 8.9791, + "loss/crossentropy": 2.135189712047577, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3053634464740753, + "step": 740 + }, + { + "epoch": 0.046375, + "grad_norm": 3.71875, + "grad_norm_var": 0.14488016764322917, + "learning_rate": 0.0001, + "loss": 9.0268, + "loss/crossentropy": 2.4625048637390137, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3120746314525604, + "step": 742 + }, + { + "epoch": 0.0465, + "grad_norm": 3.734375, + "grad_norm_var": 0.142333984375, + "learning_rate": 0.0001, + "loss": 9.1708, + "loss/crossentropy": 2.1217297315597534, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.2820632755756378, + "step": 744 + }, + { + "epoch": 0.046625, + "grad_norm": 3.453125, + "grad_norm_var": 0.16352437337239584, + "learning_rate": 0.0001, + "loss": 8.8857, + "loss/crossentropy": 2.454026937484741, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.28094005584716797, + "step": 746 + }, + { + "epoch": 0.04675, + "grad_norm": 4.5, + "grad_norm_var": 0.19820556640625, + "learning_rate": 0.0001, + "loss": 9.1644, + "loss/crossentropy": 2.413946032524109, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3286159932613373, + "step": 748 + }, + { + "epoch": 0.046875, + "grad_norm": 3.265625, + "grad_norm_var": 0.12294514973958333, + "learning_rate": 0.0001, + "loss": 8.881, + "loss/crossentropy": 2.2393068075180054, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.30563026666641235, + "step": 750 + }, + { + "epoch": 0.047, + "grad_norm": 3.25, + "grad_norm_var": 0.11814676920572917, + "learning_rate": 0.0001, + "loss": 8.8969, + "loss/crossentropy": 2.2508221864700317, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.2862202823162079, + "step": 752 + }, + { + "epoch": 0.047125, + "grad_norm": 4.03125, + "grad_norm_var": 0.13244527180989582, + "learning_rate": 0.0001, + "loss": 9.265, + "loss/crossentropy": 2.1571661233901978, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3307356685400009, + "step": 754 + }, + { + "epoch": 0.04725, + "grad_norm": 3.34375, + "grad_norm_var": 0.13481343587239583, + "learning_rate": 0.0001, + "loss": 8.9922, + "loss/crossentropy": 2.393697500228882, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.3230316936969757, + "step": 756 + }, + { + "epoch": 0.047375, + "grad_norm": 3.640625, + "grad_norm_var": 0.13456624348958332, + "learning_rate": 0.0001, + "loss": 8.9217, + "loss/crossentropy": 2.249446392059326, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.2918042242527008, + "step": 758 + }, + { + "epoch": 0.0475, + "grad_norm": 3.75, + "grad_norm_var": 0.13603413899739583, + "learning_rate": 0.0001, + "loss": 9.0298, + "loss/crossentropy": 2.4479427337646484, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3056575655937195, + "step": 760 + }, + { + "epoch": 0.047625, + "grad_norm": 3.75, + "grad_norm_var": 0.12724609375, + "learning_rate": 0.0001, + "loss": 9.2332, + "loss/crossentropy": 2.1675299406051636, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.303151935338974, + "step": 762 + }, + { + "epoch": 0.04775, + "grad_norm": 3.96875, + "grad_norm_var": 0.08153889973958334, + "learning_rate": 0.0001, + "loss": 9.0754, + "loss/crossentropy": 2.5868079662323, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2945869415998459, + "step": 764 + }, + { + "epoch": 0.047875, + "grad_norm": 3.609375, + "grad_norm_var": 0.07388916015625, + "learning_rate": 0.0001, + "loss": 9.141, + "loss/crossentropy": 2.535553216934204, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.32601119577884674, + "step": 766 + }, + { + "epoch": 0.048, + "grad_norm": 4.0625, + "grad_norm_var": 0.05016276041666667, + "learning_rate": 0.0001, + "loss": 9.2516, + "loss/crossentropy": 2.5762476921081543, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.34192636609077454, + "step": 768 + }, + { + "epoch": 0.048125, + "grad_norm": 3.921875, + "grad_norm_var": 0.0440582275390625, + "learning_rate": 0.0001, + "loss": 9.0968, + "loss/crossentropy": 2.402553081512451, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3043902814388275, + "step": 770 + }, + { + "epoch": 0.04825, + "grad_norm": 3.3125, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 0.0001, + "loss": 9.0553, + "loss/crossentropy": 2.4971920251846313, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.30807921290397644, + "step": 772 + }, + { + "epoch": 0.048375, + "grad_norm": 3.625, + "grad_norm_var": 0.043843587239583336, + "learning_rate": 0.0001, + "loss": 9.1797, + "loss/crossentropy": 2.2825552225112915, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.31863027811050415, + "step": 774 + }, + { + "epoch": 0.0485, + "grad_norm": 3.625, + "grad_norm_var": 0.04838765462239583, + "learning_rate": 0.0001, + "loss": 8.9698, + "loss/crossentropy": 2.516672134399414, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.31801968812942505, + "step": 776 + }, + { + "epoch": 0.048625, + "grad_norm": 4.125, + "grad_norm_var": 0.06825764973958333, + "learning_rate": 0.0001, + "loss": 8.9749, + "loss/crossentropy": 2.495086908340454, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.31983429193496704, + "step": 778 + }, + { + "epoch": 0.04875, + "grad_norm": 4.28125, + "grad_norm_var": 0.12460530598958333, + "learning_rate": 0.0001, + "loss": 9.1932, + "loss/crossentropy": 2.3892232179641724, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.3304327577352524, + "step": 780 + }, + { + "epoch": 0.048875, + "grad_norm": 3.3125, + "grad_norm_var": 0.17346598307291666, + "learning_rate": 0.0001, + "loss": 9.0205, + "loss/crossentropy": 2.4275970458984375, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.29281261563301086, + "step": 782 + }, + { + "epoch": 0.049, + "grad_norm": 3.90625, + "grad_norm_var": 0.16988525390625, + "learning_rate": 0.0001, + "loss": 9.0345, + "loss/crossentropy": 2.147684335708618, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.30130012333393097, + "step": 784 + }, + { + "epoch": 0.049125, + "grad_norm": 3.53125, + "grad_norm_var": 0.1740234375, + "learning_rate": 0.0001, + "loss": 9.1809, + "loss/crossentropy": 2.387849450111389, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.32018375396728516, + "step": 786 + }, + { + "epoch": 0.04925, + "grad_norm": 3.59375, + "grad_norm_var": 0.16553446451822917, + "learning_rate": 0.0001, + "loss": 8.9704, + "loss/crossentropy": 2.1901475191116333, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2830745130777359, + "step": 788 + }, + { + "epoch": 0.049375, + "grad_norm": 3.390625, + "grad_norm_var": 0.17476806640625, + "learning_rate": 0.0001, + "loss": 9.0056, + "loss/crossentropy": 2.476477026939392, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.30152270197868347, + "step": 790 + }, + { + "epoch": 0.0495, + "grad_norm": 3.75, + "grad_norm_var": 0.16788736979166666, + "learning_rate": 0.0001, + "loss": 9.2374, + "loss/crossentropy": 2.4895143508911133, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.32273176312446594, + "step": 792 + }, + { + "epoch": 0.049625, + "grad_norm": 3.390625, + "grad_norm_var": 0.16460673014322916, + "learning_rate": 0.0001, + "loss": 8.8151, + "loss/crossentropy": 2.1444047689437866, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.2969019412994385, + "step": 794 + }, + { + "epoch": 0.04975, + "grad_norm": 3.390625, + "grad_norm_var": 0.06581624348958333, + "learning_rate": 0.0001, + "loss": 9.0359, + "loss/crossentropy": 2.314146399497986, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.297823429107666, + "step": 796 + }, + { + "epoch": 0.049875, + "grad_norm": 3.484375, + "grad_norm_var": 0.046223958333333336, + "learning_rate": 0.0001, + "loss": 8.9617, + "loss/crossentropy": 2.1535879373550415, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.29098525643348694, + "step": 798 + }, + { + "epoch": 0.05, + "grad_norm": 3.59375, + "grad_norm_var": 0.04029032389322917, + "learning_rate": 0.0001, + "loss": 8.9443, + "loss/crossentropy": 2.5792254209518433, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.30909422039985657, + "step": 800 + }, + { + "epoch": 0.050125, + "grad_norm": 3.671875, + "grad_norm_var": 0.041258748372395834, + "learning_rate": 0.0001, + "loss": 8.9077, + "loss/crossentropy": 2.387328624725342, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.35100018978118896, + "step": 802 + }, + { + "epoch": 0.05025, + "grad_norm": 3.484375, + "grad_norm_var": 0.0410308837890625, + "learning_rate": 0.0001, + "loss": 8.9889, + "loss/crossentropy": 2.5987643003463745, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2944849133491516, + "step": 804 + }, + { + "epoch": 0.050375, + "grad_norm": 3.46875, + "grad_norm_var": 0.03965555826822917, + "learning_rate": 0.0001, + "loss": 8.688, + "loss/crossentropy": 2.291478753089905, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.3056093603372574, + "step": 806 + }, + { + "epoch": 0.0505, + "grad_norm": 3.6875, + "grad_norm_var": 0.030744425455729165, + "learning_rate": 0.0001, + "loss": 8.8988, + "loss/crossentropy": 2.281537890434265, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31349045038223267, + "step": 808 + }, + { + "epoch": 0.050625, + "grad_norm": 3.546875, + "grad_norm_var": 0.026634724934895833, + "learning_rate": 0.0001, + "loss": 9.0642, + "loss/crossentropy": 2.16494482755661, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.307197168469429, + "step": 810 + }, + { + "epoch": 0.05075, + "grad_norm": 3.75, + "grad_norm_var": 0.019953409830729168, + "learning_rate": 0.0001, + "loss": 8.9202, + "loss/crossentropy": 2.296713352203369, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.30054841935634613, + "step": 812 + }, + { + "epoch": 0.050875, + "grad_norm": 3.640625, + "grad_norm_var": 0.021012369791666666, + "learning_rate": 0.0001, + "loss": 8.8666, + "loss/crossentropy": 2.0488401055336, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.2681543007493019, + "step": 814 + }, + { + "epoch": 0.051, + "grad_norm": 3.75, + "grad_norm_var": 0.02427978515625, + "learning_rate": 0.0001, + "loss": 8.9113, + "loss/crossentropy": 2.1317135095596313, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2823093831539154, + "step": 816 + }, + { + "epoch": 0.051125, + "grad_norm": 3.640625, + "grad_norm_var": 0.03144124348958333, + "learning_rate": 0.0001, + "loss": 8.9404, + "loss/crossentropy": 2.3469539880752563, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.30824559926986694, + "step": 818 + }, + { + "epoch": 0.05125, + "grad_norm": 3.921875, + "grad_norm_var": 0.05701395670572917, + "learning_rate": 0.0001, + "loss": 9.108, + "loss/crossentropy": 2.4571563005447388, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31656327843666077, + "step": 820 + }, + { + "epoch": 0.051375, + "grad_norm": 3.65625, + "grad_norm_var": 0.053141276041666664, + "learning_rate": 0.0001, + "loss": 9.0717, + "loss/crossentropy": 2.3994951248168945, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3029633164405823, + "step": 822 + }, + { + "epoch": 0.0515, + "grad_norm": 3.6875, + "grad_norm_var": 0.051985677083333334, + "learning_rate": 0.0001, + "loss": 8.9063, + "loss/crossentropy": 2.2581586837768555, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.30481448769569397, + "step": 824 + }, + { + "epoch": 0.051625, + "grad_norm": 3.125, + "grad_norm_var": 0.07155659993489584, + "learning_rate": 0.0001, + "loss": 8.8898, + "loss/crossentropy": 2.071722984313965, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.2941209524869919, + "step": 826 + }, + { + "epoch": 0.05175, + "grad_norm": 3.625, + "grad_norm_var": 0.0892578125, + "learning_rate": 0.0001, + "loss": 9.0358, + "loss/crossentropy": 2.5443246364593506, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.29996325075626373, + "step": 828 + }, + { + "epoch": 0.051875, + "grad_norm": 3.40625, + "grad_norm_var": 0.09868062337239583, + "learning_rate": 0.0001, + "loss": 8.9568, + "loss/crossentropy": 2.384023070335388, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.31264999508857727, + "step": 830 + }, + { + "epoch": 0.052, + "grad_norm": 3.953125, + "grad_norm_var": 0.0974609375, + "learning_rate": 0.0001, + "loss": 8.9013, + "loss/crossentropy": 2.4286372661590576, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.32337459921836853, + "step": 832 + }, + { + "epoch": 0.052125, + "grad_norm": 3.71875, + "grad_norm_var": 0.08315327962239584, + "learning_rate": 0.0001, + "loss": 8.7277, + "loss/crossentropy": 2.1722983717918396, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3173587769269943, + "step": 834 + }, + { + "epoch": 0.05225, + "grad_norm": 4.03125, + "grad_norm_var": 0.07082417805989584, + "learning_rate": 0.0001, + "loss": 9.0322, + "loss/crossentropy": 2.6337625980377197, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.30448000133037567, + "step": 836 + }, + { + "epoch": 0.052375, + "grad_norm": 3.53125, + "grad_norm_var": 0.0727203369140625, + "learning_rate": 0.0001, + "loss": 8.9959, + "loss/crossentropy": 2.303470253944397, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.318126916885376, + "step": 838 + }, + { + "epoch": 0.0525, + "grad_norm": 3.734375, + "grad_norm_var": 0.0729888916015625, + "learning_rate": 0.0001, + "loss": 8.7451, + "loss/crossentropy": 2.295042037963867, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.30773746967315674, + "step": 840 + }, + { + "epoch": 0.052625, + "grad_norm": 3.0625, + "grad_norm_var": 0.0851226806640625, + "learning_rate": 0.0001, + "loss": 8.6922, + "loss/crossentropy": 2.060616612434387, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.29626762866973877, + "step": 842 + }, + { + "epoch": 0.05275, + "grad_norm": 3.671875, + "grad_norm_var": 0.070556640625, + "learning_rate": 0.0001, + "loss": 8.9668, + "loss/crossentropy": 2.2909332513809204, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2921592891216278, + "step": 844 + }, + { + "epoch": 0.052875, + "grad_norm": 4.21875, + "grad_norm_var": 0.092822265625, + "learning_rate": 0.0001, + "loss": 9.1657, + "loss/crossentropy": 2.5100677013397217, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3154117166996002, + "step": 846 + }, + { + "epoch": 0.053, + "grad_norm": 3.546875, + "grad_norm_var": 0.09158528645833333, + "learning_rate": 0.0001, + "loss": 9.1623, + "loss/crossentropy": 2.3510499000549316, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.28174377977848053, + "step": 848 + }, + { + "epoch": 0.053125, + "grad_norm": 3.5625, + "grad_norm_var": 0.08963216145833333, + "learning_rate": 0.0001, + "loss": 8.8411, + "loss/crossentropy": 2.361242413520813, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.2962343841791153, + "step": 850 + }, + { + "epoch": 0.05325, + "grad_norm": 3.40625, + "grad_norm_var": 0.08534749348958333, + "learning_rate": 0.0001, + "loss": 8.8426, + "loss/crossentropy": 1.9918023943901062, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.26788248866796494, + "step": 852 + }, + { + "epoch": 0.053375, + "grad_norm": 3.25, + "grad_norm_var": 0.0908111572265625, + "learning_rate": 0.0001, + "loss": 8.9666, + "loss/crossentropy": 2.6088958978652954, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.3181132972240448, + "step": 854 + }, + { + "epoch": 0.0535, + "grad_norm": 3.8125, + "grad_norm_var": 0.09516499837239584, + "learning_rate": 0.0001, + "loss": 8.8605, + "loss/crossentropy": 2.1732386350631714, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3017688989639282, + "step": 856 + }, + { + "epoch": 0.053625, + "grad_norm": 3.578125, + "grad_norm_var": 0.080029296875, + "learning_rate": 0.0001, + "loss": 9.0765, + "loss/crossentropy": 2.3053905963897705, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28416720032691956, + "step": 858 + }, + { + "epoch": 0.05375, + "grad_norm": 3.53125, + "grad_norm_var": 0.08489481608072917, + "learning_rate": 0.0001, + "loss": 8.9506, + "loss/crossentropy": 2.3922606706619263, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3041910231113434, + "step": 860 + }, + { + "epoch": 0.053875, + "grad_norm": 3.625, + "grad_norm_var": 0.059326171875, + "learning_rate": 0.0001, + "loss": 8.9744, + "loss/crossentropy": 2.2312803864479065, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.28423628211021423, + "step": 862 + }, + { + "epoch": 0.054, + "grad_norm": 3.5625, + "grad_norm_var": 0.053938802083333334, + "learning_rate": 0.0001, + "loss": 8.8504, + "loss/crossentropy": 2.4400585889816284, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.32312142848968506, + "step": 864 + }, + { + "epoch": 0.054125, + "grad_norm": 3.453125, + "grad_norm_var": 0.06122639973958333, + "learning_rate": 0.0001, + "loss": 8.7439, + "loss/crossentropy": 2.16322124004364, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.3096587359905243, + "step": 866 + }, + { + "epoch": 0.05425, + "grad_norm": 3.59375, + "grad_norm_var": 0.05967508951822917, + "learning_rate": 0.0001, + "loss": 8.7824, + "loss/crossentropy": 2.2714940309524536, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.31225262582302094, + "step": 868 + }, + { + "epoch": 0.054375, + "grad_norm": 3.59375, + "grad_norm_var": 0.061335245768229164, + "learning_rate": 0.0001, + "loss": 8.9188, + "loss/crossentropy": 2.3233022689819336, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.29183535277843475, + "step": 870 + }, + { + "epoch": 0.0545, + "grad_norm": 3.46875, + "grad_norm_var": 0.060628255208333336, + "learning_rate": 0.0001, + "loss": 8.8018, + "loss/crossentropy": 2.1473275423049927, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.302778959274292, + "step": 872 + }, + { + "epoch": 0.054625, + "grad_norm": 3.5, + "grad_norm_var": 0.04723307291666667, + "learning_rate": 0.0001, + "loss": 8.7291, + "loss/crossentropy": 2.10613477230072, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2919985055923462, + "step": 874 + }, + { + "epoch": 0.05475, + "grad_norm": 3.828125, + "grad_norm_var": 0.04791259765625, + "learning_rate": 0.0001, + "loss": 8.8805, + "loss/crossentropy": 2.391135811805725, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.303908035159111, + "step": 876 + }, + { + "epoch": 0.054875, + "grad_norm": 4.21875, + "grad_norm_var": 0.07213134765625, + "learning_rate": 0.0001, + "loss": 8.9757, + "loss/crossentropy": 2.191763758659363, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.29602205753326416, + "step": 878 + }, + { + "epoch": 0.055, + "grad_norm": 3.09375, + "grad_norm_var": 0.0880859375, + "learning_rate": 0.0001, + "loss": 8.9226, + "loss/crossentropy": 2.3897584676742554, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2984778434038162, + "step": 880 + }, + { + "epoch": 0.055125, + "grad_norm": 3.53125, + "grad_norm_var": 0.10453999837239583, + "learning_rate": 0.0001, + "loss": 8.6909, + "loss/crossentropy": 2.2274895906448364, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.3162970691919327, + "step": 882 + }, + { + "epoch": 0.05525, + "grad_norm": 3.5, + "grad_norm_var": 0.10286051432291667, + "learning_rate": 0.0001, + "loss": 8.7805, + "loss/crossentropy": 2.238261342048645, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.3091089427471161, + "step": 884 + }, + { + "epoch": 0.055375, + "grad_norm": 3.578125, + "grad_norm_var": 0.09251200358072917, + "learning_rate": 0.0001, + "loss": 8.8335, + "loss/crossentropy": 2.398587703704834, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.30020007491111755, + "step": 886 + }, + { + "epoch": 0.0555, + "grad_norm": 3.125, + "grad_norm_var": 0.11856180826822917, + "learning_rate": 0.0001, + "loss": 8.6695, + "loss/crossentropy": 2.362698554992676, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3049694448709488, + "step": 888 + }, + { + "epoch": 0.055625, + "grad_norm": 4.09375, + "grad_norm_var": 0.13857014973958334, + "learning_rate": 0.0001, + "loss": 8.8959, + "loss/crossentropy": 2.483735680580139, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3071689158678055, + "step": 890 + }, + { + "epoch": 0.05575, + "grad_norm": 3.734375, + "grad_norm_var": 0.1455230712890625, + "learning_rate": 0.0001, + "loss": 9.1407, + "loss/crossentropy": 2.6666314601898193, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3311910331249237, + "step": 892 + }, + { + "epoch": 0.055875, + "grad_norm": 3.703125, + "grad_norm_var": 0.1199615478515625, + "learning_rate": 0.0001, + "loss": 8.8907, + "loss/crossentropy": 2.3543641567230225, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.30659469962120056, + "step": 894 + }, + { + "epoch": 0.056, + "grad_norm": 3.546875, + "grad_norm_var": 0.11165364583333333, + "learning_rate": 0.0001, + "loss": 8.542, + "loss/crossentropy": 2.442333459854126, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.2963842451572418, + "step": 896 + }, + { + "epoch": 0.056125, + "grad_norm": 3.25, + "grad_norm_var": 0.09463602701822917, + "learning_rate": 0.0001, + "loss": 8.8421, + "loss/crossentropy": 2.5169384479522705, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.31081072986125946, + "step": 898 + }, + { + "epoch": 0.05625, + "grad_norm": 3.609375, + "grad_norm_var": 0.09204813639322916, + "learning_rate": 0.0001, + "loss": 8.8349, + "loss/crossentropy": 2.378996729850769, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2988849878311157, + "step": 900 + }, + { + "epoch": 0.056375, + "grad_norm": 4.0, + "grad_norm_var": 0.10779520670572916, + "learning_rate": 0.0001, + "loss": 8.6611, + "loss/crossentropy": 2.0278642177581787, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.26663029193878174, + "step": 902 + }, + { + "epoch": 0.0565, + "grad_norm": 3.625, + "grad_norm_var": 0.08515218098958334, + "learning_rate": 0.0001, + "loss": 8.8614, + "loss/crossentropy": 2.3273751735687256, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.2889983803033829, + "step": 904 + }, + { + "epoch": 0.056625, + "grad_norm": 3.46875, + "grad_norm_var": 0.07197265625, + "learning_rate": 0.0001, + "loss": 8.8386, + "loss/crossentropy": 2.0480875372886658, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.30192601680755615, + "step": 906 + }, + { + "epoch": 0.05675, + "grad_norm": 3.859375, + "grad_norm_var": 0.05976155598958333, + "learning_rate": 0.0001, + "loss": 9.0464, + "loss/crossentropy": 2.5360888242721558, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.28871724009513855, + "step": 908 + }, + { + "epoch": 0.056875, + "grad_norm": 3.25, + "grad_norm_var": 0.060846964518229164, + "learning_rate": 0.0001, + "loss": 8.9657, + "loss/crossentropy": 2.59428608417511, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.3197096735239029, + "step": 910 + }, + { + "epoch": 0.057, + "grad_norm": 3.46875, + "grad_norm_var": 0.062027994791666666, + "learning_rate": 0.0001, + "loss": 8.8488, + "loss/crossentropy": 2.4978867769241333, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.38725124299526215, + "step": 912 + }, + { + "epoch": 0.057125, + "grad_norm": 3.640625, + "grad_norm_var": 0.060347493489583334, + "learning_rate": 0.0001, + "loss": 9.0475, + "loss/crossentropy": 2.269066333770752, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2860633432865143, + "step": 914 + }, + { + "epoch": 0.05725, + "grad_norm": 3.03125, + "grad_norm_var": 0.07659505208333334, + "learning_rate": 0.0001, + "loss": 8.6945, + "loss/crossentropy": 2.3426260948181152, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.2847317010164261, + "step": 916 + }, + { + "epoch": 0.057375, + "grad_norm": 3.53125, + "grad_norm_var": 0.055562337239583336, + "learning_rate": 0.0001, + "loss": 8.7413, + "loss/crossentropy": 2.3890886306762695, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2951260507106781, + "step": 918 + }, + { + "epoch": 0.0575, + "grad_norm": 3.46875, + "grad_norm_var": 0.05225321451822917, + "learning_rate": 0.0001, + "loss": 8.726, + "loss/crossentropy": 2.324913740158081, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2819140702486038, + "step": 920 + }, + { + "epoch": 0.057625, + "grad_norm": 4.375, + "grad_norm_var": 0.10100809733072917, + "learning_rate": 0.0001, + "loss": 8.8514, + "loss/crossentropy": 2.4282515048980713, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3256274312734604, + "step": 922 + }, + { + "epoch": 0.05775, + "grad_norm": 3.46875, + "grad_norm_var": 0.10305582682291667, + "learning_rate": 0.0001, + "loss": 8.6487, + "loss/crossentropy": 2.476005434989929, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3199878931045532, + "step": 924 + }, + { + "epoch": 0.057875, + "grad_norm": 3.71875, + "grad_norm_var": 0.13284098307291667, + "learning_rate": 0.0001, + "loss": 8.8988, + "loss/crossentropy": 2.469366192817688, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2891136407852173, + "step": 926 + }, + { + "epoch": 0.058, + "grad_norm": 3.234375, + "grad_norm_var": 0.14055989583333334, + "learning_rate": 0.0001, + "loss": 8.7523, + "loss/crossentropy": 2.348281979560852, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.27692942321300507, + "step": 928 + }, + { + "epoch": 0.058125, + "grad_norm": 3.515625, + "grad_norm_var": 0.14073893229166667, + "learning_rate": 0.0001, + "loss": 8.5853, + "loss/crossentropy": 2.1680904626846313, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.29489417374134064, + "step": 930 + }, + { + "epoch": 0.05825, + "grad_norm": 3.65625, + "grad_norm_var": 0.116650390625, + "learning_rate": 0.0001, + "loss": 8.7008, + "loss/crossentropy": 2.139521837234497, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.263563334941864, + "step": 932 + }, + { + "epoch": 0.058375, + "grad_norm": 3.265625, + "grad_norm_var": 0.13267822265625, + "learning_rate": 0.0001, + "loss": 8.8043, + "loss/crossentropy": 2.522794246673584, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3044002056121826, + "step": 934 + }, + { + "epoch": 0.0585, + "grad_norm": 3.765625, + "grad_norm_var": 0.12968343098958332, + "learning_rate": 0.0001, + "loss": 8.9035, + "loss/crossentropy": 2.4657033681869507, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.30104976892471313, + "step": 936 + }, + { + "epoch": 0.058625, + "grad_norm": 5.53125, + "grad_norm_var": 0.3282389322916667, + "learning_rate": 0.0001, + "loss": 8.9486, + "loss/crossentropy": 2.3463146686553955, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.303235799074173, + "step": 938 + }, + { + "epoch": 0.05875, + "grad_norm": 3.515625, + "grad_norm_var": 0.3243479410807292, + "learning_rate": 0.0001, + "loss": 8.9808, + "loss/crossentropy": 2.2249823808670044, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.29463791847229004, + "step": 940 + }, + { + "epoch": 0.058875, + "grad_norm": 3.25, + "grad_norm_var": 0.30130106608072915, + "learning_rate": 0.0001, + "loss": 8.67, + "loss/crossentropy": 2.334540367126465, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.30720797181129456, + "step": 942 + }, + { + "epoch": 0.059, + "grad_norm": 3.921875, + "grad_norm_var": 0.2997029622395833, + "learning_rate": 0.0001, + "loss": 8.7021, + "loss/crossentropy": 2.498934745788574, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2946365475654602, + "step": 944 + }, + { + "epoch": 0.059125, + "grad_norm": 3.71875, + "grad_norm_var": 0.2964508056640625, + "learning_rate": 0.0001, + "loss": 8.7885, + "loss/crossentropy": 2.0534738898277283, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.27286672592163086, + "step": 946 + }, + { + "epoch": 0.05925, + "grad_norm": 3.125, + "grad_norm_var": 0.3176066080729167, + "learning_rate": 0.0001, + "loss": 8.8829, + "loss/crossentropy": 2.362457275390625, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.2855580151081085, + "step": 948 + }, + { + "epoch": 0.059375, + "grad_norm": 3.6875, + "grad_norm_var": 0.30732320149739584, + "learning_rate": 0.0001, + "loss": 8.7018, + "loss/crossentropy": 2.1891767382621765, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.31324711441993713, + "step": 950 + }, + { + "epoch": 0.0595, + "grad_norm": 3.734375, + "grad_norm_var": 0.31344401041666664, + "learning_rate": 0.0001, + "loss": 8.8816, + "loss/crossentropy": 2.3532931804656982, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.30756065249443054, + "step": 952 + }, + { + "epoch": 0.059625, + "grad_norm": 3.265625, + "grad_norm_var": 0.06813863118489584, + "learning_rate": 0.0001, + "loss": 8.7884, + "loss/crossentropy": 2.365698456764221, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3289715647697449, + "step": 954 + }, + { + "epoch": 0.05975, + "grad_norm": 3.671875, + "grad_norm_var": 0.0709381103515625, + "learning_rate": 0.0001, + "loss": 8.9138, + "loss/crossentropy": 2.614629626274109, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.30402082204818726, + "step": 956 + }, + { + "epoch": 0.059875, + "grad_norm": 3.59375, + "grad_norm_var": 0.07691650390625, + "learning_rate": 0.0001, + "loss": 8.663, + "loss/crossentropy": 2.109455645084381, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.2772120535373688, + "step": 958 + }, + { + "epoch": 0.06, + "grad_norm": 3.765625, + "grad_norm_var": 0.0668121337890625, + "learning_rate": 0.0001, + "loss": 8.8741, + "loss/crossentropy": 2.380413055419922, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.3016493618488312, + "step": 960 + }, + { + "epoch": 0.060125, + "grad_norm": 3.5, + "grad_norm_var": 0.06373291015625, + "learning_rate": 0.0001, + "loss": 8.9362, + "loss/crossentropy": 2.506435751914978, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.335014671087265, + "step": 962 + }, + { + "epoch": 0.06025, + "grad_norm": 3.40625, + "grad_norm_var": 0.0563140869140625, + "learning_rate": 0.0001, + "loss": 8.6434, + "loss/crossentropy": 1.9750906229019165, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2565095126628876, + "step": 964 + }, + { + "epoch": 0.060375, + "grad_norm": 3.75, + "grad_norm_var": 0.05491434733072917, + "learning_rate": 0.0001, + "loss": 8.7565, + "loss/crossentropy": 2.2668718099594116, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2805168777704239, + "step": 966 + }, + { + "epoch": 0.0605, + "grad_norm": 3.21875, + "grad_norm_var": 0.044266764322916666, + "learning_rate": 0.0001, + "loss": 8.718, + "loss/crossentropy": 2.2082256078720093, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.2930946350097656, + "step": 968 + }, + { + "epoch": 0.060625, + "grad_norm": 3.84375, + "grad_norm_var": 0.0510894775390625, + "learning_rate": 0.0001, + "loss": 8.8922, + "loss/crossentropy": 2.4699355363845825, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.31211861968040466, + "step": 970 + }, + { + "epoch": 0.06075, + "grad_norm": 3.609375, + "grad_norm_var": 0.07473856608072917, + "learning_rate": 0.0001, + "loss": 8.6174, + "loss/crossentropy": 2.160146117210388, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.28759919106960297, + "step": 972 + }, + { + "epoch": 0.060875, + "grad_norm": 3.09375, + "grad_norm_var": 0.07998758951822917, + "learning_rate": 0.0001, + "loss": 8.9576, + "loss/crossentropy": 2.245842933654785, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.28311386704444885, + "step": 974 + }, + { + "epoch": 0.061, + "grad_norm": 3.65625, + "grad_norm_var": 0.07842508951822917, + "learning_rate": 0.0001, + "loss": 8.7692, + "loss/crossentropy": 2.3998042345046997, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.29600852727890015, + "step": 976 + }, + { + "epoch": 0.061125, + "grad_norm": 3.53125, + "grad_norm_var": 0.07856343587239584, + "learning_rate": 0.0001, + "loss": 8.7165, + "loss/crossentropy": 2.4230899810791016, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.2921423017978668, + "step": 978 + }, + { + "epoch": 0.06125, + "grad_norm": 3.203125, + "grad_norm_var": 0.08714090983072917, + "learning_rate": 0.0001, + "loss": 8.5009, + "loss/crossentropy": 2.105591118335724, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.2892334759235382, + "step": 980 + }, + { + "epoch": 0.061375, + "grad_norm": 3.671875, + "grad_norm_var": 0.08603515625, + "learning_rate": 0.0001, + "loss": 8.8291, + "loss/crossentropy": 2.342753052711487, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2922486811876297, + "step": 982 + }, + { + "epoch": 0.0615, + "grad_norm": 3.046875, + "grad_norm_var": 0.09026285807291666, + "learning_rate": 0.0001, + "loss": 8.7629, + "loss/crossentropy": 2.5545257329940796, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.29155534505844116, + "step": 984 + }, + { + "epoch": 0.061625, + "grad_norm": 3.53125, + "grad_norm_var": 0.08010152180989584, + "learning_rate": 0.0001, + "loss": 8.635, + "loss/crossentropy": 2.390330672264099, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.28914259374141693, + "step": 986 + }, + { + "epoch": 0.06175, + "grad_norm": 3.109375, + "grad_norm_var": 0.039484659830729164, + "learning_rate": 0.0001, + "loss": 8.6355, + "loss/crossentropy": 2.031624495983124, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.29533734917640686, + "step": 988 + }, + { + "epoch": 0.061875, + "grad_norm": 3.625, + "grad_norm_var": 0.04781494140625, + "learning_rate": 0.0001, + "loss": 8.8029, + "loss/crossentropy": 2.1612058877944946, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.2922551929950714, + "step": 990 + }, + { + "epoch": 0.062, + "grad_norm": 3.15625, + "grad_norm_var": 0.06575520833333333, + "learning_rate": 0.0001, + "loss": 8.8484, + "loss/crossentropy": 2.2686573266983032, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.28837865591049194, + "step": 992 + }, + { + "epoch": 0.062125, + "grad_norm": 3.953125, + "grad_norm_var": 0.08382161458333333, + "learning_rate": 0.0001, + "loss": 8.9572, + "loss/crossentropy": 2.4839664697647095, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.30035223066806793, + "step": 994 + }, + { + "epoch": 0.06225, + "grad_norm": 3.328125, + "grad_norm_var": 0.0792388916015625, + "learning_rate": 0.0001, + "loss": 8.7334, + "loss/crossentropy": 2.434647560119629, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.30409903824329376, + "step": 996 + }, + { + "epoch": 0.062375, + "grad_norm": 4.09375, + "grad_norm_var": 0.10009663899739583, + "learning_rate": 0.0001, + "loss": 8.7896, + "loss/crossentropy": 2.349792718887329, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.273041769862175, + "step": 998 + }, + { + "epoch": 0.0625, + "grad_norm": 2.875, + "grad_norm_var": 0.1135162353515625, + "learning_rate": 0.0001, + "loss": 8.7517, + "loss/crossentropy": 2.424883484840393, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.28756730258464813, + "step": 1000 + }, + { + "epoch": 0.062625, + "grad_norm": 3.1875, + "grad_norm_var": 0.1218414306640625, + "learning_rate": 0.0001, + "loss": 8.5904, + "loss/crossentropy": 2.4634835720062256, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.28353893756866455, + "step": 1002 + }, + { + "epoch": 0.06275, + "grad_norm": 3.421875, + "grad_norm_var": 0.10761311848958334, + "learning_rate": 0.0001, + "loss": 8.6432, + "loss/crossentropy": 2.3544020652770996, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.291437104344368, + "step": 1004 + }, + { + "epoch": 0.062875, + "grad_norm": 3.203125, + "grad_norm_var": 0.11253255208333333, + "learning_rate": 0.0001, + "loss": 8.6969, + "loss/crossentropy": 2.3183122873306274, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.291623592376709, + "step": 1006 + }, + { + "epoch": 0.063, + "grad_norm": 3.078125, + "grad_norm_var": 0.10125325520833334, + "learning_rate": 0.0001, + "loss": 8.7425, + "loss/crossentropy": 2.5724557638168335, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3042745739221573, + "step": 1008 + }, + { + "epoch": 0.063125, + "grad_norm": 3.484375, + "grad_norm_var": 0.08701171875, + "learning_rate": 0.0001, + "loss": 8.7429, + "loss/crossentropy": 2.393343210220337, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.28968481719493866, + "step": 1010 + }, + { + "epoch": 0.06325, + "grad_norm": 3.359375, + "grad_norm_var": 0.0902740478515625, + "learning_rate": 0.0001, + "loss": 8.7322, + "loss/crossentropy": 2.4881935119628906, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.3185275048017502, + "step": 1012 + }, + { + "epoch": 0.063375, + "grad_norm": 3.3125, + "grad_norm_var": 0.05918680826822917, + "learning_rate": 0.0001, + "loss": 8.6637, + "loss/crossentropy": 2.1747263073921204, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.27498696744441986, + "step": 1014 + }, + { + "epoch": 0.0635, + "grad_norm": 3.734375, + "grad_norm_var": 0.09000651041666667, + "learning_rate": 0.0001, + "loss": 8.6898, + "loss/crossentropy": 2.5582990646362305, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.3123309761285782, + "step": 1016 + }, + { + "epoch": 0.063625, + "grad_norm": 2.96875, + "grad_norm_var": 0.10186258951822917, + "learning_rate": 0.0001, + "loss": 8.4983, + "loss/crossentropy": 2.2065166234970093, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2910768985748291, + "step": 1018 + }, + { + "epoch": 0.06375, + "grad_norm": 3.171875, + "grad_norm_var": 0.10444234212239584, + "learning_rate": 0.0001, + "loss": 8.5609, + "loss/crossentropy": 2.3102041482925415, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2962404191493988, + "step": 1020 + }, + { + "epoch": 0.063875, + "grad_norm": 3.171875, + "grad_norm_var": 0.10598042805989584, + "learning_rate": 0.0001, + "loss": 8.619, + "loss/crossentropy": 2.330891489982605, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.301542192697525, + "step": 1022 + }, + { + "epoch": 0.064, + "grad_norm": 3.390625, + "grad_norm_var": 0.10001627604166667, + "learning_rate": 0.0001, + "loss": 8.4584, + "loss/crossentropy": 2.4117361307144165, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.2964586764574051, + "step": 1024 + }, + { + "epoch": 0.064125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0966796875, + "learning_rate": 0.0001, + "loss": 8.5442, + "loss/crossentropy": 2.069741904735565, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2646178603172302, + "step": 1026 + }, + { + "epoch": 0.06425, + "grad_norm": 3.359375, + "grad_norm_var": 0.09484049479166666, + "learning_rate": 0.0001, + "loss": 8.7027, + "loss/crossentropy": 2.4950772523880005, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.30084407329559326, + "step": 1028 + }, + { + "epoch": 0.064375, + "grad_norm": 3.46875, + "grad_norm_var": 0.096923828125, + "learning_rate": 0.0001, + "loss": 8.6711, + "loss/crossentropy": 2.3811144828796387, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.2816159278154373, + "step": 1030 + }, + { + "epoch": 0.0645, + "grad_norm": 3.265625, + "grad_norm_var": 0.03141988118489583, + "learning_rate": 0.0001, + "loss": 8.5786, + "loss/crossentropy": 2.146743893623352, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2715151458978653, + "step": 1032 + }, + { + "epoch": 0.064625, + "grad_norm": 3.53125, + "grad_norm_var": 0.0274566650390625, + "learning_rate": 0.0001, + "loss": 8.6314, + "loss/crossentropy": 2.453763008117676, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.28644131124019623, + "step": 1034 + }, + { + "epoch": 0.06475, + "grad_norm": 3.375, + "grad_norm_var": 0.024583943684895835, + "learning_rate": 0.0001, + "loss": 8.4581, + "loss/crossentropy": 2.189074158668518, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.28300249576568604, + "step": 1036 + }, + { + "epoch": 0.064875, + "grad_norm": 3.0625, + "grad_norm_var": 0.038102213541666666, + "learning_rate": 0.0001, + "loss": 8.5097, + "loss/crossentropy": 2.2738513946533203, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.27797406911849976, + "step": 1038 + }, + { + "epoch": 0.065, + "grad_norm": 3.3125, + "grad_norm_var": 0.0379547119140625, + "learning_rate": 0.0001, + "loss": 8.7724, + "loss/crossentropy": 2.5585055351257324, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3508901298046112, + "step": 1040 + }, + { + "epoch": 0.065125, + "grad_norm": 3.640625, + "grad_norm_var": 0.04047749837239583, + "learning_rate": 0.0001, + "loss": 8.8154, + "loss/crossentropy": 2.3115618228912354, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2781260311603546, + "step": 1042 + }, + { + "epoch": 0.06525, + "grad_norm": 3.25, + "grad_norm_var": 0.0395660400390625, + "learning_rate": 0.0001, + "loss": 8.5646, + "loss/crossentropy": 2.2015340328216553, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.29191769659519196, + "step": 1044 + }, + { + "epoch": 0.065375, + "grad_norm": 3.640625, + "grad_norm_var": 0.04152730305989583, + "learning_rate": 0.0001, + "loss": 8.7064, + "loss/crossentropy": 2.4439064264297485, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.31912754476070404, + "step": 1046 + }, + { + "epoch": 0.0655, + "grad_norm": 3.390625, + "grad_norm_var": 0.0404937744140625, + "learning_rate": 0.0001, + "loss": 8.7495, + "loss/crossentropy": 2.611847996711731, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.31853775680065155, + "step": 1048 + }, + { + "epoch": 0.065625, + "grad_norm": 3.75, + "grad_norm_var": 0.04395243326822917, + "learning_rate": 0.0001, + "loss": 8.5285, + "loss/crossentropy": 2.058500051498413, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2707058787345886, + "step": 1050 + }, + { + "epoch": 0.06575, + "grad_norm": 3.453125, + "grad_norm_var": 0.043355305989583336, + "learning_rate": 0.0001, + "loss": 8.6134, + "loss/crossentropy": 2.3460679054260254, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.29806579649448395, + "step": 1052 + }, + { + "epoch": 0.065875, + "grad_norm": 3.359375, + "grad_norm_var": 0.028531901041666665, + "learning_rate": 0.0001, + "loss": 8.7906, + "loss/crossentropy": 2.575559377670288, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.29479941725730896, + "step": 1054 + }, + { + "epoch": 0.066, + "grad_norm": 3.5, + "grad_norm_var": 0.03673502604166667, + "learning_rate": 0.0001, + "loss": 8.573, + "loss/crossentropy": 2.4485961198806763, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.3214751183986664, + "step": 1056 + }, + { + "epoch": 0.066125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0400390625, + "learning_rate": 0.0001, + "loss": 8.5355, + "loss/crossentropy": 2.397768259048462, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.31295061111450195, + "step": 1058 + }, + { + "epoch": 0.06625, + "grad_norm": 3.21875, + "grad_norm_var": 0.040848795572916666, + "learning_rate": 0.0001, + "loss": 8.4975, + "loss/crossentropy": 2.390447735786438, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.29537099599838257, + "step": 1060 + }, + { + "epoch": 0.066375, + "grad_norm": 3.046875, + "grad_norm_var": 0.04865620930989583, + "learning_rate": 0.0001, + "loss": 8.553, + "loss/crossentropy": 2.417343854904175, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.28230586647987366, + "step": 1062 + }, + { + "epoch": 0.0665, + "grad_norm": 3.390625, + "grad_norm_var": 0.046507771809895834, + "learning_rate": 0.0001, + "loss": 8.5829, + "loss/crossentropy": 2.6324515342712402, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.3016352653503418, + "step": 1064 + }, + { + "epoch": 0.066625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03913472493489583, + "learning_rate": 0.0001, + "loss": 8.5346, + "loss/crossentropy": 2.323632597923279, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2779388278722763, + "step": 1066 + }, + { + "epoch": 0.06675, + "grad_norm": 2.953125, + "grad_norm_var": 0.03640950520833333, + "learning_rate": 0.0001, + "loss": 8.3832, + "loss/crossentropy": 2.393522083759308, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.29860424995422363, + "step": 1068 + }, + { + "epoch": 0.066875, + "grad_norm": 3.4375, + "grad_norm_var": 0.039839680989583334, + "learning_rate": 0.0001, + "loss": 8.6174, + "loss/crossentropy": 2.263484001159668, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.2802084982395172, + "step": 1070 + }, + { + "epoch": 0.067, + "grad_norm": 3.296875, + "grad_norm_var": 0.03242085774739583, + "learning_rate": 0.0001, + "loss": 8.4765, + "loss/crossentropy": 2.1152660846710205, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2554662525653839, + "step": 1072 + }, + { + "epoch": 0.067125, + "grad_norm": 3.171875, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 8.5434, + "loss/crossentropy": 2.4824490547180176, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.28639310598373413, + "step": 1074 + }, + { + "epoch": 0.06725, + "grad_norm": 3.0, + "grad_norm_var": 0.033014933268229164, + "learning_rate": 0.0001, + "loss": 8.6253, + "loss/crossentropy": 2.5830127000808716, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.3099432587623596, + "step": 1076 + }, + { + "epoch": 0.067375, + "grad_norm": 3.96875, + "grad_norm_var": 0.0666168212890625, + "learning_rate": 0.0001, + "loss": 8.7433, + "loss/crossentropy": 2.580743908882141, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2936056852340698, + "step": 1078 + }, + { + "epoch": 0.0675, + "grad_norm": 3.21875, + "grad_norm_var": 0.07141011555989583, + "learning_rate": 0.0001, + "loss": 8.6721, + "loss/crossentropy": 2.350903868675232, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2968801259994507, + "step": 1080 + }, + { + "epoch": 0.067625, + "grad_norm": 3.5, + "grad_norm_var": 0.07302144368489584, + "learning_rate": 0.0001, + "loss": 8.4574, + "loss/crossentropy": 2.257428526878357, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2766515016555786, + "step": 1082 + }, + { + "epoch": 0.06775, + "grad_norm": 3.265625, + "grad_norm_var": 0.06972249348958333, + "learning_rate": 0.0001, + "loss": 8.6877, + "loss/crossentropy": 2.2526296377182007, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.28292417526245117, + "step": 1084 + }, + { + "epoch": 0.067875, + "grad_norm": 3.5, + "grad_norm_var": 0.06391499837239584, + "learning_rate": 0.0001, + "loss": 8.7502, + "loss/crossentropy": 2.3044220209121704, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2810298800468445, + "step": 1086 + }, + { + "epoch": 0.068, + "grad_norm": 3.125, + "grad_norm_var": 0.06946614583333334, + "learning_rate": 0.0001, + "loss": 8.585, + "loss/crossentropy": 2.2871402502059937, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.27473995089530945, + "step": 1088 + }, + { + "epoch": 0.068125, + "grad_norm": 3.890625, + "grad_norm_var": 0.08155008951822916, + "learning_rate": 0.0001, + "loss": 8.7181, + "loss/crossentropy": 2.49469530582428, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2961750328540802, + "step": 1090 + }, + { + "epoch": 0.06825, + "grad_norm": 3.15625, + "grad_norm_var": 0.07158915201822917, + "learning_rate": 0.0001, + "loss": 8.6174, + "loss/crossentropy": 1.980285882949829, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.27205583453178406, + "step": 1092 + }, + { + "epoch": 0.068375, + "grad_norm": 3.65625, + "grad_norm_var": 0.06383056640625, + "learning_rate": 0.0001, + "loss": 8.383, + "loss/crossentropy": 2.4063356518745422, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2908872812986374, + "step": 1094 + }, + { + "epoch": 0.0685, + "grad_norm": 3.34375, + "grad_norm_var": 0.05968424479166667, + "learning_rate": 0.0001, + "loss": 8.6962, + "loss/crossentropy": 2.3776599168777466, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2903301566839218, + "step": 1096 + }, + { + "epoch": 0.068625, + "grad_norm": 3.515625, + "grad_norm_var": 0.05542704264322917, + "learning_rate": 0.0001, + "loss": 8.4252, + "loss/crossentropy": 2.25793194770813, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.2622709423303604, + "step": 1098 + }, + { + "epoch": 0.06875, + "grad_norm": 3.0625, + "grad_norm_var": 0.068017578125, + "learning_rate": 0.0001, + "loss": 8.4571, + "loss/crossentropy": 2.0190887451171875, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.25865359604358673, + "step": 1100 + }, + { + "epoch": 0.068875, + "grad_norm": 3.65625, + "grad_norm_var": 0.15084228515625, + "learning_rate": 0.0001, + "loss": 8.7145, + "loss/crossentropy": 2.419832944869995, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.36829979717731476, + "step": 1102 + }, + { + "epoch": 0.069, + "grad_norm": 3.203125, + "grad_norm_var": 0.18118489583333333, + "learning_rate": 0.0001, + "loss": 8.8892, + "loss/crossentropy": 2.3291234970092773, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.27904945611953735, + "step": 1104 + }, + { + "epoch": 0.069125, + "grad_norm": 3.75, + "grad_norm_var": 0.18000386555989584, + "learning_rate": 0.0001, + "loss": 8.7385, + "loss/crossentropy": 2.3887641429901123, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.3103363811969757, + "step": 1106 + }, + { + "epoch": 0.06925, + "grad_norm": 3.171875, + "grad_norm_var": 0.1803863525390625, + "learning_rate": 0.0001, + "loss": 8.4626, + "loss/crossentropy": 2.2511096000671387, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2744671106338501, + "step": 1108 + }, + { + "epoch": 0.069375, + "grad_norm": 3.453125, + "grad_norm_var": 0.16078999837239583, + "learning_rate": 0.0001, + "loss": 8.6296, + "loss/crossentropy": 2.251457929611206, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2970822751522064, + "step": 1110 + }, + { + "epoch": 0.0695, + "grad_norm": 3.171875, + "grad_norm_var": 0.18325907389322918, + "learning_rate": 0.0001, + "loss": 8.4855, + "loss/crossentropy": 2.3782349824905396, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.30054476857185364, + "step": 1112 + }, + { + "epoch": 0.069625, + "grad_norm": 3.375, + "grad_norm_var": 0.18281962076822916, + "learning_rate": 0.0001, + "loss": 8.6186, + "loss/crossentropy": 2.605000376701355, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.298698827624321, + "step": 1114 + }, + { + "epoch": 0.06975, + "grad_norm": 3.34375, + "grad_norm_var": 0.15660807291666667, + "learning_rate": 0.0001, + "loss": 8.5642, + "loss/crossentropy": 2.2239125967025757, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.284654900431633, + "step": 1116 + }, + { + "epoch": 0.069875, + "grad_norm": 3.3125, + "grad_norm_var": 0.09190165201822917, + "learning_rate": 0.0001, + "loss": 8.6873, + "loss/crossentropy": 2.3841261863708496, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.2972448319196701, + "step": 1118 + }, + { + "epoch": 0.07, + "grad_norm": 3.53125, + "grad_norm_var": 0.04534098307291667, + "learning_rate": 0.0001, + "loss": 8.4881, + "loss/crossentropy": 2.251163959503174, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2940330058336258, + "step": 1120 + }, + { + "epoch": 0.070125, + "grad_norm": 3.640625, + "grad_norm_var": 0.041552734375, + "learning_rate": 0.0001, + "loss": 8.6624, + "loss/crossentropy": 2.390018582344055, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.31052152812480927, + "step": 1122 + }, + { + "epoch": 0.07025, + "grad_norm": 2.875, + "grad_norm_var": 0.06489156087239584, + "learning_rate": 0.0001, + "loss": 8.5129, + "loss/crossentropy": 2.2108170986175537, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2933100759983063, + "step": 1124 + }, + { + "epoch": 0.070375, + "grad_norm": 3.25, + "grad_norm_var": 0.06334635416666666, + "learning_rate": 0.0001, + "loss": 8.4376, + "loss/crossentropy": 2.2771471738815308, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.28437741100788116, + "step": 1126 + }, + { + "epoch": 0.0705, + "grad_norm": 3.453125, + "grad_norm_var": 0.05369364420572917, + "learning_rate": 0.0001, + "loss": 8.4285, + "loss/crossentropy": 2.2954673767089844, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2847554385662079, + "step": 1128 + }, + { + "epoch": 0.070625, + "grad_norm": 3.3125, + "grad_norm_var": 0.0545318603515625, + "learning_rate": 0.0001, + "loss": 8.4656, + "loss/crossentropy": 2.271798253059387, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2639819011092186, + "step": 1130 + }, + { + "epoch": 0.07075, + "grad_norm": 3.78125, + "grad_norm_var": 0.0674957275390625, + "learning_rate": 0.0001, + "loss": 8.6624, + "loss/crossentropy": 2.5219074487686157, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.30493326485157013, + "step": 1132 + }, + { + "epoch": 0.070875, + "grad_norm": 2.75, + "grad_norm_var": 0.09687093098958334, + "learning_rate": 0.0001, + "loss": 8.3251, + "loss/crossentropy": 2.419742465019226, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.29505568742752075, + "step": 1134 + }, + { + "epoch": 0.071, + "grad_norm": 3.734375, + "grad_norm_var": 0.1082916259765625, + "learning_rate": 0.0001, + "loss": 8.6851, + "loss/crossentropy": 2.460718870162964, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2991577684879303, + "step": 1136 + }, + { + "epoch": 0.071125, + "grad_norm": 3.140625, + "grad_norm_var": 0.08157145182291667, + "learning_rate": 0.0001, + "loss": 8.5178, + "loss/crossentropy": 2.173828959465027, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.27509088814258575, + "step": 1138 + }, + { + "epoch": 0.07125, + "grad_norm": 3.109375, + "grad_norm_var": 0.07099202473958334, + "learning_rate": 0.0001, + "loss": 8.2813, + "loss/crossentropy": 2.0987906455993652, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.26560700684785843, + "step": 1140 + }, + { + "epoch": 0.071375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0721588134765625, + "learning_rate": 0.0001, + "loss": 8.5089, + "loss/crossentropy": 2.3777287006378174, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.27554096281528473, + "step": 1142 + }, + { + "epoch": 0.0715, + "grad_norm": 3.390625, + "grad_norm_var": 0.07099507649739584, + "learning_rate": 0.0001, + "loss": 8.3986, + "loss/crossentropy": 2.294643998146057, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.28792035579681396, + "step": 1144 + }, + { + "epoch": 0.071625, + "grad_norm": 3.3125, + "grad_norm_var": 0.07714436848958334, + "learning_rate": 0.0001, + "loss": 8.5452, + "loss/crossentropy": 2.3531687259674072, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2780257761478424, + "step": 1146 + }, + { + "epoch": 0.07175, + "grad_norm": 3.15625, + "grad_norm_var": 0.06738993326822916, + "learning_rate": 0.0001, + "loss": 8.5518, + "loss/crossentropy": 2.0771710872650146, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26807837188243866, + "step": 1148 + }, + { + "epoch": 0.071875, + "grad_norm": 3.421875, + "grad_norm_var": 0.04797770182291667, + "learning_rate": 0.0001, + "loss": 8.5416, + "loss/crossentropy": 2.18049418926239, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.2739409804344177, + "step": 1150 + }, + { + "epoch": 0.072, + "grad_norm": 3.09375, + "grad_norm_var": 0.03434956868489583, + "learning_rate": 0.0001, + "loss": 8.4693, + "loss/crossentropy": 2.666857123374939, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.30170081555843353, + "step": 1152 + }, + { + "epoch": 0.072125, + "grad_norm": 3.171875, + "grad_norm_var": 0.06629130045572916, + "learning_rate": 0.0001, + "loss": 8.6786, + "loss/crossentropy": 2.5841041803359985, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.30311837792396545, + "step": 1154 + }, + { + "epoch": 0.07225, + "grad_norm": 3.171875, + "grad_norm_var": 0.0630767822265625, + "learning_rate": 0.0001, + "loss": 8.5017, + "loss/crossentropy": 2.2012165784835815, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.2716425508260727, + "step": 1156 + }, + { + "epoch": 0.072375, + "grad_norm": 3.171875, + "grad_norm_var": 0.06253255208333333, + "learning_rate": 0.0001, + "loss": 8.5665, + "loss/crossentropy": 2.2168599367141724, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.30834463238716125, + "step": 1158 + }, + { + "epoch": 0.0725, + "grad_norm": 4.15625, + "grad_norm_var": 0.1092193603515625, + "learning_rate": 0.0001, + "loss": 8.4126, + "loss/crossentropy": 2.1044594049453735, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2663833498954773, + "step": 1160 + }, + { + "epoch": 0.072625, + "grad_norm": 2.828125, + "grad_norm_var": 0.11414388020833334, + "learning_rate": 0.0001, + "loss": 8.3621, + "loss/crossentropy": 2.109754800796509, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.27043384313583374, + "step": 1162 + }, + { + "epoch": 0.07275, + "grad_norm": 3.53125, + "grad_norm_var": 0.1210845947265625, + "learning_rate": 0.0001, + "loss": 8.4745, + "loss/crossentropy": 2.384516477584839, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27273619174957275, + "step": 1164 + }, + { + "epoch": 0.072875, + "grad_norm": 2.984375, + "grad_norm_var": 0.1336090087890625, + "learning_rate": 0.0001, + "loss": 8.4642, + "loss/crossentropy": 2.214682459831238, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.27829450368881226, + "step": 1166 + }, + { + "epoch": 0.073, + "grad_norm": 3.4375, + "grad_norm_var": 0.12939046223958334, + "learning_rate": 0.0001, + "loss": 8.5784, + "loss/crossentropy": 2.276426315307617, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.29027001559734344, + "step": 1168 + }, + { + "epoch": 0.073125, + "grad_norm": 3.09375, + "grad_norm_var": 0.109130859375, + "learning_rate": 0.0001, + "loss": 8.353, + "loss/crossentropy": 2.1743494272232056, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.26005150377750397, + "step": 1170 + }, + { + "epoch": 0.07325, + "grad_norm": 3.421875, + "grad_norm_var": 0.11611226399739584, + "learning_rate": 0.0001, + "loss": 8.4193, + "loss/crossentropy": 2.279123902320862, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2806738466024399, + "step": 1172 + }, + { + "epoch": 0.073375, + "grad_norm": 3.390625, + "grad_norm_var": 0.11841532389322916, + "learning_rate": 0.0001, + "loss": 8.5702, + "loss/crossentropy": 2.050893008708954, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.253355473279953, + "step": 1174 + }, + { + "epoch": 0.0735, + "grad_norm": 3.375, + "grad_norm_var": 0.07099202473958334, + "learning_rate": 0.0001, + "loss": 8.7427, + "loss/crossentropy": 2.5286508798599243, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.31759728491306305, + "step": 1176 + }, + { + "epoch": 0.073625, + "grad_norm": 3.3125, + "grad_norm_var": 0.0608795166015625, + "learning_rate": 0.0001, + "loss": 8.6737, + "loss/crossentropy": 2.445157289505005, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.29573802649974823, + "step": 1178 + }, + { + "epoch": 0.07375, + "grad_norm": 3.46875, + "grad_norm_var": 0.05756734212239583, + "learning_rate": 0.0001, + "loss": 8.5287, + "loss/crossentropy": 2.3890769481658936, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2753005623817444, + "step": 1180 + }, + { + "epoch": 0.073875, + "grad_norm": 3.375, + "grad_norm_var": 0.052099609375, + "learning_rate": 0.0001, + "loss": 8.5552, + "loss/crossentropy": 2.2562392950057983, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.27074071764945984, + "step": 1182 + }, + { + "epoch": 0.074, + "grad_norm": 3.1875, + "grad_norm_var": 0.0451568603515625, + "learning_rate": 0.0001, + "loss": 8.4914, + "loss/crossentropy": 1.960713267326355, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.264203280210495, + "step": 1184 + }, + { + "epoch": 0.074125, + "grad_norm": 3.265625, + "grad_norm_var": 0.036279296875, + "learning_rate": 0.0001, + "loss": 8.4506, + "loss/crossentropy": 2.357021927833557, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28563813865184784, + "step": 1186 + }, + { + "epoch": 0.07425, + "grad_norm": 3.125, + "grad_norm_var": 0.03333333333333333, + "learning_rate": 0.0001, + "loss": 8.3867, + "loss/crossentropy": 2.1780192852020264, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.26176655292510986, + "step": 1188 + }, + { + "epoch": 0.074375, + "grad_norm": 3.328125, + "grad_norm_var": 0.034228515625, + "learning_rate": 0.0001, + "loss": 8.4641, + "loss/crossentropy": 2.075110673904419, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.29416830837726593, + "step": 1190 + }, + { + "epoch": 0.0745, + "grad_norm": 3.09375, + "grad_norm_var": 0.0424468994140625, + "learning_rate": 0.0001, + "loss": 8.3866, + "loss/crossentropy": 2.152463436126709, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2740413099527359, + "step": 1192 + }, + { + "epoch": 0.074625, + "grad_norm": 3.28125, + "grad_norm_var": 0.039449055989583336, + "learning_rate": 0.0001, + "loss": 8.4003, + "loss/crossentropy": 2.381898283958435, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.29159918427467346, + "step": 1194 + }, + { + "epoch": 0.07475, + "grad_norm": 2.84375, + "grad_norm_var": 0.04258524576822917, + "learning_rate": 0.0001, + "loss": 8.5591, + "loss/crossentropy": 2.6784266233444214, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.30832037329673767, + "step": 1196 + }, + { + "epoch": 0.074875, + "grad_norm": 3.296875, + "grad_norm_var": 0.0288238525390625, + "learning_rate": 0.0001, + "loss": 8.477, + "loss/crossentropy": 2.1786980628967285, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.28018996119499207, + "step": 1198 + }, + { + "epoch": 0.075, + "grad_norm": 3.234375, + "grad_norm_var": 0.0292877197265625, + "learning_rate": 0.0001, + "loss": 8.6585, + "loss/crossentropy": 2.248537063598633, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28961239755153656, + "step": 1200 + }, + { + "epoch": 0.075125, + "grad_norm": 3.140625, + "grad_norm_var": 0.029621378580729166, + "learning_rate": 0.0001, + "loss": 8.4601, + "loss/crossentropy": 2.343456268310547, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2769605219364166, + "step": 1202 + }, + { + "epoch": 0.07525, + "grad_norm": 3.140625, + "grad_norm_var": 0.0279449462890625, + "learning_rate": 0.0001, + "loss": 8.5551, + "loss/crossentropy": 2.3451120853424072, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.31704986095428467, + "step": 1204 + }, + { + "epoch": 0.075375, + "grad_norm": 4.125, + "grad_norm_var": 0.07421875, + "learning_rate": 0.0001, + "loss": 8.2784, + "loss/crossentropy": 2.2383298873901367, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2643394321203232, + "step": 1206 + }, + { + "epoch": 0.0755, + "grad_norm": 3.9375, + "grad_norm_var": 0.10383707682291667, + "learning_rate": 0.0001, + "loss": 8.6391, + "loss/crossentropy": 2.0706650018692017, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2589013874530792, + "step": 1208 + }, + { + "epoch": 0.075625, + "grad_norm": 3.375, + "grad_norm_var": 0.10327046712239583, + "learning_rate": 0.0001, + "loss": 8.6296, + "loss/crossentropy": 2.264005422592163, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.28355173766613007, + "step": 1210 + }, + { + "epoch": 0.07575, + "grad_norm": 3.078125, + "grad_norm_var": 0.09348042805989583, + "learning_rate": 0.0001, + "loss": 8.3618, + "loss/crossentropy": 2.4925496578216553, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2993692457675934, + "step": 1212 + }, + { + "epoch": 0.075875, + "grad_norm": 3.375, + "grad_norm_var": 0.09510091145833334, + "learning_rate": 0.0001, + "loss": 8.5563, + "loss/crossentropy": 2.339760661125183, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27024510502815247, + "step": 1214 + }, + { + "epoch": 0.076, + "grad_norm": 3.1875, + "grad_norm_var": 0.10005594889322916, + "learning_rate": 0.0001, + "loss": 8.5717, + "loss/crossentropy": 2.357187867164612, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.3173932731151581, + "step": 1216 + }, + { + "epoch": 0.076125, + "grad_norm": 3.140625, + "grad_norm_var": 0.10143229166666666, + "learning_rate": 0.0001, + "loss": 8.3378, + "loss/crossentropy": 2.5177258253097534, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.29622797667980194, + "step": 1218 + }, + { + "epoch": 0.07625, + "grad_norm": 3.25, + "grad_norm_var": 0.098779296875, + "learning_rate": 0.0001, + "loss": 8.4854, + "loss/crossentropy": 2.0992863178253174, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2855496108531952, + "step": 1220 + }, + { + "epoch": 0.076375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0710601806640625, + "learning_rate": 0.0001, + "loss": 8.3916, + "loss/crossentropy": 2.4129068851470947, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.280709832906723, + "step": 1222 + }, + { + "epoch": 0.0765, + "grad_norm": 3.359375, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 8.3373, + "loss/crossentropy": 2.0929447412490845, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.25636833161115646, + "step": 1224 + }, + { + "epoch": 0.076625, + "grad_norm": 3.0, + "grad_norm_var": 0.027326456705729165, + "learning_rate": 0.0001, + "loss": 8.4648, + "loss/crossentropy": 2.427080750465393, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2882840186357498, + "step": 1226 + }, + { + "epoch": 0.07675, + "grad_norm": 3.140625, + "grad_norm_var": 0.026585896809895832, + "learning_rate": 0.0001, + "loss": 8.3397, + "loss/crossentropy": 2.4737452268600464, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.28378529846668243, + "step": 1228 + }, + { + "epoch": 0.076875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0234771728515625, + "learning_rate": 0.0001, + "loss": 8.6966, + "loss/crossentropy": 2.6298500299453735, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.306839257478714, + "step": 1230 + }, + { + "epoch": 0.077, + "grad_norm": 2.859375, + "grad_norm_var": 0.03771158854166667, + "learning_rate": 0.0001, + "loss": 8.4196, + "loss/crossentropy": 2.3488998413085938, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.27983449399471283, + "step": 1232 + }, + { + "epoch": 0.077125, + "grad_norm": 3.828125, + "grad_norm_var": 0.06613667805989583, + "learning_rate": 0.0001, + "loss": 8.5206, + "loss/crossentropy": 2.2696053981781006, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2854994237422943, + "step": 1234 + }, + { + "epoch": 0.07725, + "grad_norm": 2.765625, + "grad_norm_var": 0.07351888020833333, + "learning_rate": 0.0001, + "loss": 8.2717, + "loss/crossentropy": 2.174792766571045, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2614743113517761, + "step": 1236 + }, + { + "epoch": 0.077375, + "grad_norm": 3.921875, + "grad_norm_var": 0.09851786295572916, + "learning_rate": 0.0001, + "loss": 8.6137, + "loss/crossentropy": 2.2606674432754517, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.28789395093917847, + "step": 1238 + }, + { + "epoch": 0.0775, + "grad_norm": 3.921875, + "grad_norm_var": 0.12114969889322917, + "learning_rate": 0.0001, + "loss": 8.4582, + "loss/crossentropy": 2.104749917984009, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.27152082324028015, + "step": 1240 + }, + { + "epoch": 0.077625, + "grad_norm": 3.28125, + "grad_norm_var": 0.1411041259765625, + "learning_rate": 0.0001, + "loss": 8.3861, + "loss/crossentropy": 2.198368549346924, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.26415789127349854, + "step": 1242 + }, + { + "epoch": 0.07775, + "grad_norm": 3.015625, + "grad_norm_var": 0.1497711181640625, + "learning_rate": 0.0001, + "loss": 8.5154, + "loss/crossentropy": 2.4816545248031616, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.30805790424346924, + "step": 1244 + }, + { + "epoch": 0.077875, + "grad_norm": 3.515625, + "grad_norm_var": 0.1504547119140625, + "learning_rate": 0.0001, + "loss": 8.5166, + "loss/crossentropy": 2.4640896320343018, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.29878415167331696, + "step": 1246 + }, + { + "epoch": 0.078, + "grad_norm": 2.796875, + "grad_norm_var": 0.15390218098958333, + "learning_rate": 0.0001, + "loss": 8.3599, + "loss/crossentropy": 2.2658848762512207, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.264192171394825, + "step": 1248 + }, + { + "epoch": 0.078125, + "grad_norm": 3.5, + "grad_norm_var": 0.142724609375, + "learning_rate": 0.0001, + "loss": 8.3164, + "loss/crossentropy": 2.1921703815460205, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2650914490222931, + "step": 1250 + }, + { + "epoch": 0.07825, + "grad_norm": 2.921875, + "grad_norm_var": 0.1355865478515625, + "learning_rate": 0.0001, + "loss": 8.5194, + "loss/crossentropy": 2.2807745933532715, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.28195033967494965, + "step": 1252 + }, + { + "epoch": 0.078375, + "grad_norm": 3.34375, + "grad_norm_var": 0.12167561848958333, + "learning_rate": 0.0001, + "loss": 8.4772, + "loss/crossentropy": 2.145294189453125, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2704995721578598, + "step": 1254 + }, + { + "epoch": 0.0785, + "grad_norm": 2.859375, + "grad_norm_var": 0.1064849853515625, + "learning_rate": 0.0001, + "loss": 8.2792, + "loss/crossentropy": 2.3187735080718994, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.28507962822914124, + "step": 1256 + }, + { + "epoch": 0.078625, + "grad_norm": 3.734375, + "grad_norm_var": 0.08872782389322917, + "learning_rate": 0.0001, + "loss": 8.4454, + "loss/crossentropy": 2.226912260055542, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2773582488298416, + "step": 1258 + }, + { + "epoch": 0.07875, + "grad_norm": 2.921875, + "grad_norm_var": 0.08684488932291666, + "learning_rate": 0.0001, + "loss": 8.4253, + "loss/crossentropy": 2.524247169494629, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.28448525071144104, + "step": 1260 + }, + { + "epoch": 0.078875, + "grad_norm": 3.109375, + "grad_norm_var": 0.08153889973958334, + "learning_rate": 0.0001, + "loss": 8.4428, + "loss/crossentropy": 2.434785842895508, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2782677710056305, + "step": 1262 + }, + { + "epoch": 0.079, + "grad_norm": 3.171875, + "grad_norm_var": 0.09170633951822917, + "learning_rate": 0.0001, + "loss": 8.3836, + "loss/crossentropy": 2.0830533504486084, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27287817001342773, + "step": 1264 + }, + { + "epoch": 0.079125, + "grad_norm": 3.265625, + "grad_norm_var": 0.09251200358072917, + "learning_rate": 0.0001, + "loss": 8.3748, + "loss/crossentropy": 2.2747669219970703, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.3025789111852646, + "step": 1266 + }, + { + "epoch": 0.07925, + "grad_norm": 3.140625, + "grad_norm_var": 0.07610270182291666, + "learning_rate": 0.0001, + "loss": 8.4631, + "loss/crossentropy": 2.2862067222595215, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2721339762210846, + "step": 1268 + }, + { + "epoch": 0.079375, + "grad_norm": 3.21875, + "grad_norm_var": 0.07183837890625, + "learning_rate": 0.0001, + "loss": 8.4595, + "loss/crossentropy": 2.3111391067504883, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2573637366294861, + "step": 1270 + }, + { + "epoch": 0.0795, + "grad_norm": 3.0625, + "grad_norm_var": 0.06552632649739583, + "learning_rate": 0.0001, + "loss": 8.3966, + "loss/crossentropy": 2.5623552799224854, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.26713909208774567, + "step": 1272 + }, + { + "epoch": 0.079625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04524637858072917, + "learning_rate": 0.0001, + "loss": 8.2061, + "loss/crossentropy": 2.1350300312042236, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.26287516951560974, + "step": 1274 + }, + { + "epoch": 0.07975, + "grad_norm": 3.046875, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 8.5091, + "loss/crossentropy": 2.3671988248825073, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.274411678314209, + "step": 1276 + }, + { + "epoch": 0.079875, + "grad_norm": 3.109375, + "grad_norm_var": 0.034989420572916666, + "learning_rate": 0.0001, + "loss": 8.4391, + "loss/crossentropy": 2.280188202857971, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2855434864759445, + "step": 1278 + }, + { + "epoch": 0.08, + "grad_norm": 3.34375, + "grad_norm_var": 0.013997395833333334, + "learning_rate": 0.0001, + "loss": 8.561, + "loss/crossentropy": 2.09197735786438, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2575538009405136, + "step": 1280 + }, + { + "epoch": 0.080125, + "grad_norm": 2.859375, + "grad_norm_var": 0.0140045166015625, + "learning_rate": 0.0001, + "loss": 8.374, + "loss/crossentropy": 2.199427366256714, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2671803832054138, + "step": 1282 + }, + { + "epoch": 0.08025, + "grad_norm": 3.15625, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 8.4494, + "loss/crossentropy": 2.646793484687805, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.26508933305740356, + "step": 1284 + }, + { + "epoch": 0.080375, + "grad_norm": 3.125, + "grad_norm_var": 0.017378743489583334, + "learning_rate": 0.0001, + "loss": 8.3257, + "loss/crossentropy": 2.255902647972107, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26800450682640076, + "step": 1286 + }, + { + "epoch": 0.0805, + "grad_norm": 3.703125, + "grad_norm_var": 0.03821207682291667, + "learning_rate": 0.0001, + "loss": 8.6049, + "loss/crossentropy": 2.4666751623153687, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2575417757034302, + "step": 1288 + }, + { + "epoch": 0.080625, + "grad_norm": 3.234375, + "grad_norm_var": 0.038874308268229164, + "learning_rate": 0.0001, + "loss": 8.4277, + "loss/crossentropy": 2.2299275398254395, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26189981400966644, + "step": 1290 + }, + { + "epoch": 0.08075, + "grad_norm": 3.234375, + "grad_norm_var": 0.0371978759765625, + "learning_rate": 0.0001, + "loss": 8.3189, + "loss/crossentropy": 2.3310989141464233, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.26487114280462265, + "step": 1292 + }, + { + "epoch": 0.080875, + "grad_norm": 3.6875, + "grad_norm_var": 0.05325520833333333, + "learning_rate": 0.0001, + "loss": 8.4445, + "loss/crossentropy": 2.2357208728790283, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27044905722141266, + "step": 1294 + }, + { + "epoch": 0.081, + "grad_norm": 2.828125, + "grad_norm_var": 0.0831451416015625, + "learning_rate": 0.0001, + "loss": 8.2945, + "loss/crossentropy": 2.4127997159957886, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2716076225042343, + "step": 1296 + }, + { + "epoch": 0.081125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0810546875, + "learning_rate": 0.0001, + "loss": 8.1039, + "loss/crossentropy": 2.271215081214905, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.26447129994630814, + "step": 1298 + }, + { + "epoch": 0.08125, + "grad_norm": 2.953125, + "grad_norm_var": 0.08394266764322916, + "learning_rate": 0.0001, + "loss": 8.4898, + "loss/crossentropy": 2.214189291000366, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.26683974266052246, + "step": 1300 + }, + { + "epoch": 0.081375, + "grad_norm": 3.03125, + "grad_norm_var": 0.09244791666666667, + "learning_rate": 0.0001, + "loss": 8.4019, + "loss/crossentropy": 2.1998738050460815, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2771088480949402, + "step": 1302 + }, + { + "epoch": 0.0815, + "grad_norm": 3.515625, + "grad_norm_var": 0.08205464680989584, + "learning_rate": 0.0001, + "loss": 8.391, + "loss/crossentropy": 2.10916006565094, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.26495426893234253, + "step": 1304 + }, + { + "epoch": 0.081625, + "grad_norm": 3.015625, + "grad_norm_var": 0.111962890625, + "learning_rate": 0.0001, + "loss": 8.4097, + "loss/crossentropy": 2.428833842277527, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28503939509391785, + "step": 1306 + }, + { + "epoch": 0.08175, + "grad_norm": 3.046875, + "grad_norm_var": 0.1131988525390625, + "learning_rate": 0.0001, + "loss": 8.441, + "loss/crossentropy": 2.512449622154236, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.33712296187877655, + "step": 1308 + }, + { + "epoch": 0.081875, + "grad_norm": 3.28125, + "grad_norm_var": 0.088330078125, + "learning_rate": 0.0001, + "loss": 8.3889, + "loss/crossentropy": 2.438145875930786, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2805769294500351, + "step": 1310 + }, + { + "epoch": 0.082, + "grad_norm": 3.03125, + "grad_norm_var": 0.0719390869140625, + "learning_rate": 0.0001, + "loss": 8.2675, + "loss/crossentropy": 2.35923433303833, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2907231003046036, + "step": 1312 + }, + { + "epoch": 0.082125, + "grad_norm": 3.125, + "grad_norm_var": 0.07124735514322916, + "learning_rate": 0.0001, + "loss": 8.3908, + "loss/crossentropy": 2.288873791694641, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2815335690975189, + "step": 1314 + }, + { + "epoch": 0.08225, + "grad_norm": 3.125, + "grad_norm_var": 0.0664459228515625, + "learning_rate": 0.0001, + "loss": 8.4692, + "loss/crossentropy": 2.4051743745803833, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.282541960477829, + "step": 1316 + }, + { + "epoch": 0.082375, + "grad_norm": 3.734375, + "grad_norm_var": 0.0790924072265625, + "learning_rate": 0.0001, + "loss": 8.2673, + "loss/crossentropy": 2.115163564682007, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26003655791282654, + "step": 1318 + }, + { + "epoch": 0.0825, + "grad_norm": 2.984375, + "grad_norm_var": 0.07752278645833334, + "learning_rate": 0.0001, + "loss": 8.3407, + "loss/crossentropy": 2.294703960418701, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2557590380311012, + "step": 1320 + }, + { + "epoch": 0.082625, + "grad_norm": 3.140625, + "grad_norm_var": 0.041356404622395836, + "learning_rate": 0.0001, + "loss": 8.3636, + "loss/crossentropy": 2.6248332262039185, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2877664119005203, + "step": 1322 + }, + { + "epoch": 0.08275, + "grad_norm": 3.234375, + "grad_norm_var": 0.04487202962239583, + "learning_rate": 0.0001, + "loss": 8.392, + "loss/crossentropy": 2.212457776069641, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2759707272052765, + "step": 1324 + }, + { + "epoch": 0.082875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0431304931640625, + "learning_rate": 0.0001, + "loss": 8.3794, + "loss/crossentropy": 1.8898176550865173, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28947535157203674, + "step": 1326 + }, + { + "epoch": 0.083, + "grad_norm": 3.1875, + "grad_norm_var": 0.04039713541666667, + "learning_rate": 0.0001, + "loss": 8.4226, + "loss/crossentropy": 2.3176772594451904, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.30063313245773315, + "step": 1328 + }, + { + "epoch": 0.083125, + "grad_norm": 3.421875, + "grad_norm_var": 0.04169514973958333, + "learning_rate": 0.0001, + "loss": 8.033, + "loss/crossentropy": 1.9844502806663513, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.22963125258684158, + "step": 1330 + }, + { + "epoch": 0.08325, + "grad_norm": 2.984375, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 0.0001, + "loss": 8.3364, + "loss/crossentropy": 2.0860679745674133, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.28032663464546204, + "step": 1332 + }, + { + "epoch": 0.083375, + "grad_norm": 3.171875, + "grad_norm_var": 0.028120930989583334, + "learning_rate": 0.0001, + "loss": 8.2641, + "loss/crossentropy": 2.2230706214904785, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26876039803028107, + "step": 1334 + }, + { + "epoch": 0.0835, + "grad_norm": 3.28125, + "grad_norm_var": 0.026904296875, + "learning_rate": 0.0001, + "loss": 8.4189, + "loss/crossentropy": 2.307973623275757, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2883561700582504, + "step": 1336 + }, + { + "epoch": 0.083625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0290679931640625, + "learning_rate": 0.0001, + "loss": 8.292, + "loss/crossentropy": 2.3308621644973755, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26050567626953125, + "step": 1338 + }, + { + "epoch": 0.08375, + "grad_norm": 3.40625, + "grad_norm_var": 0.03827718098958333, + "learning_rate": 0.0001, + "loss": 8.2063, + "loss/crossentropy": 2.1553597450256348, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27626167237758636, + "step": 1340 + }, + { + "epoch": 0.083875, + "grad_norm": 3.09375, + "grad_norm_var": 0.08676656087239583, + "learning_rate": 0.0001, + "loss": 8.5019, + "loss/crossentropy": 2.21061909198761, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.27770161628723145, + "step": 1342 + }, + { + "epoch": 0.084, + "grad_norm": 2.734375, + "grad_norm_var": 0.1104400634765625, + "learning_rate": 0.0001, + "loss": 8.222, + "loss/crossentropy": 2.2863982915878296, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2523474544286728, + "step": 1344 + }, + { + "epoch": 0.084125, + "grad_norm": 3.09375, + "grad_norm_var": 0.11259358723958333, + "learning_rate": 0.0001, + "loss": 8.307, + "loss/crossentropy": 2.4110026359558105, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.26567649841308594, + "step": 1346 + }, + { + "epoch": 0.08425, + "grad_norm": 3.265625, + "grad_norm_var": 0.10590718587239584, + "learning_rate": 0.0001, + "loss": 8.3584, + "loss/crossentropy": 2.3141287565231323, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2731921225786209, + "step": 1348 + }, + { + "epoch": 0.084375, + "grad_norm": 3.140625, + "grad_norm_var": 0.09840087890625, + "learning_rate": 0.0001, + "loss": 8.3221, + "loss/crossentropy": 2.4939377307891846, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2732496112585068, + "step": 1350 + }, + { + "epoch": 0.0845, + "grad_norm": 3.09375, + "grad_norm_var": 0.09888916015625, + "learning_rate": 0.0001, + "loss": 8.4167, + "loss/crossentropy": 2.6840182542800903, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2915331721305847, + "step": 1352 + }, + { + "epoch": 0.084625, + "grad_norm": 2.984375, + "grad_norm_var": 0.1009674072265625, + "learning_rate": 0.0001, + "loss": 8.361, + "loss/crossentropy": 2.3241279125213623, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.25332190841436386, + "step": 1354 + }, + { + "epoch": 0.08475, + "grad_norm": 2.96875, + "grad_norm_var": 0.09444071451822916, + "learning_rate": 0.0001, + "loss": 8.128, + "loss/crossentropy": 2.2505773305892944, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.27977539598941803, + "step": 1356 + }, + { + "epoch": 0.084875, + "grad_norm": 3.609375, + "grad_norm_var": 0.04345296223958333, + "learning_rate": 0.0001, + "loss": 8.383, + "loss/crossentropy": 2.4839106798171997, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2985747307538986, + "step": 1358 + }, + { + "epoch": 0.085, + "grad_norm": 3.046875, + "grad_norm_var": 0.03417561848958333, + "learning_rate": 0.0001, + "loss": 8.3302, + "loss/crossentropy": 2.1147927045822144, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26798177510499954, + "step": 1360 + }, + { + "epoch": 0.085125, + "grad_norm": 3.03125, + "grad_norm_var": 0.03313395182291667, + "learning_rate": 0.0001, + "loss": 8.2878, + "loss/crossentropy": 2.341870665550232, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.29636865854263306, + "step": 1362 + }, + { + "epoch": 0.08525, + "grad_norm": 3.296875, + "grad_norm_var": 0.033772786458333336, + "learning_rate": 0.0001, + "loss": 8.3404, + "loss/crossentropy": 2.3820383548736572, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.308891698718071, + "step": 1364 + }, + { + "epoch": 0.085375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03592122395833333, + "learning_rate": 0.0001, + "loss": 8.2509, + "loss/crossentropy": 2.406251907348633, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2710695117712021, + "step": 1366 + }, + { + "epoch": 0.0855, + "grad_norm": 3.09375, + "grad_norm_var": 0.04088541666666667, + "learning_rate": 0.0001, + "loss": 8.4671, + "loss/crossentropy": 2.4824094772338867, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2687048017978668, + "step": 1368 + }, + { + "epoch": 0.085625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0371490478515625, + "learning_rate": 0.0001, + "loss": 8.2799, + "loss/crossentropy": 2.1223180890083313, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.25326162576675415, + "step": 1370 + }, + { + "epoch": 0.08575, + "grad_norm": 3.015625, + "grad_norm_var": 0.0335113525390625, + "learning_rate": 0.0001, + "loss": 8.2298, + "loss/crossentropy": 2.3254255056381226, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28770148754119873, + "step": 1372 + }, + { + "epoch": 0.085875, + "grad_norm": 3.0, + "grad_norm_var": 0.018928019205729167, + "learning_rate": 0.0001, + "loss": 8.405, + "loss/crossentropy": 2.369110345840454, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2755488455295563, + "step": 1374 + }, + { + "epoch": 0.086, + "grad_norm": 2.90625, + "grad_norm_var": 0.018919881184895834, + "learning_rate": 0.0001, + "loss": 8.1925, + "loss/crossentropy": 2.5451020002365112, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.27585017681121826, + "step": 1376 + }, + { + "epoch": 0.086125, + "grad_norm": 3.109375, + "grad_norm_var": 0.018876139322916666, + "learning_rate": 0.0001, + "loss": 8.5378, + "loss/crossentropy": 2.5744348764419556, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28956979513168335, + "step": 1378 + }, + { + "epoch": 0.08625, + "grad_norm": 3.046875, + "grad_norm_var": 0.014729817708333334, + "learning_rate": 0.0001, + "loss": 8.3531, + "loss/crossentropy": 2.0667566061019897, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2600719928741455, + "step": 1380 + }, + { + "epoch": 0.086375, + "grad_norm": 3.1875, + "grad_norm_var": 0.017464192708333333, + "learning_rate": 0.0001, + "loss": 8.3839, + "loss/crossentropy": 2.383557438850403, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.279419407248497, + "step": 1382 + }, + { + "epoch": 0.0865, + "grad_norm": 3.125, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 8.3527, + "loss/crossentropy": 2.262708902359009, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.26019924879074097, + "step": 1384 + }, + { + "epoch": 0.086625, + "grad_norm": 3.28125, + "grad_norm_var": 0.01695556640625, + "learning_rate": 0.0001, + "loss": 8.4089, + "loss/crossentropy": 2.422199249267578, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.257827490568161, + "step": 1386 + }, + { + "epoch": 0.08675, + "grad_norm": 3.09375, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 8.2639, + "loss/crossentropy": 2.341743230819702, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2593771815299988, + "step": 1388 + }, + { + "epoch": 0.086875, + "grad_norm": 3.828125, + "grad_norm_var": 0.042578125, + "learning_rate": 0.0001, + "loss": 8.2408, + "loss/crossentropy": 2.2627909183502197, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.262659452855587, + "step": 1390 + }, + { + "epoch": 0.087, + "grad_norm": 2.84375, + "grad_norm_var": 0.05591532389322917, + "learning_rate": 0.0001, + "loss": 8.202, + "loss/crossentropy": 2.2393475770950317, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2788470536470413, + "step": 1392 + }, + { + "epoch": 0.087125, + "grad_norm": 3.4375, + "grad_norm_var": 0.06106363932291667, + "learning_rate": 0.0001, + "loss": 8.468, + "loss/crossentropy": 2.256345748901367, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2531740814447403, + "step": 1394 + }, + { + "epoch": 0.08725, + "grad_norm": 3.359375, + "grad_norm_var": 0.06864827473958333, + "learning_rate": 0.0001, + "loss": 8.3001, + "loss/crossentropy": 2.263739228248596, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.26853087544441223, + "step": 1396 + }, + { + "epoch": 0.087375, + "grad_norm": 3.125, + "grad_norm_var": 0.067919921875, + "learning_rate": 0.0001, + "loss": 8.2254, + "loss/crossentropy": 2.3907299041748047, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26690760254859924, + "step": 1398 + }, + { + "epoch": 0.0875, + "grad_norm": 3.15625, + "grad_norm_var": 0.06586812337239584, + "learning_rate": 0.0001, + "loss": 8.4731, + "loss/crossentropy": 2.3437803983688354, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.259974405169487, + "step": 1400 + }, + { + "epoch": 0.087625, + "grad_norm": 2.90625, + "grad_norm_var": 0.06672261555989584, + "learning_rate": 0.0001, + "loss": 8.4484, + "loss/crossentropy": 2.3048956394195557, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2741893529891968, + "step": 1402 + }, + { + "epoch": 0.08775, + "grad_norm": 3.15625, + "grad_norm_var": 0.06726888020833334, + "learning_rate": 0.0001, + "loss": 8.3698, + "loss/crossentropy": 2.3523448705673218, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.280557781457901, + "step": 1404 + }, + { + "epoch": 0.087875, + "grad_norm": 3.25, + "grad_norm_var": 0.033524576822916666, + "learning_rate": 0.0001, + "loss": 8.3848, + "loss/crossentropy": 2.3498148918151855, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3289744406938553, + "step": 1406 + }, + { + "epoch": 0.088, + "grad_norm": 3.9375, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 0.0001, + "loss": 8.2261, + "loss/crossentropy": 2.071715295314789, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.26693500578403473, + "step": 1408 + }, + { + "epoch": 0.088125, + "grad_norm": 3.421875, + "grad_norm_var": 0.0634429931640625, + "learning_rate": 0.0001, + "loss": 8.2024, + "loss/crossentropy": 2.333922863006592, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27316156029701233, + "step": 1410 + }, + { + "epoch": 0.08825, + "grad_norm": 3.265625, + "grad_norm_var": 0.05562744140625, + "learning_rate": 0.0001, + "loss": 8.0937, + "loss/crossentropy": 2.1679932475090027, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.25762278586626053, + "step": 1412 + }, + { + "epoch": 0.088375, + "grad_norm": 2.96875, + "grad_norm_var": 0.0583648681640625, + "learning_rate": 0.0001, + "loss": 8.2188, + "loss/crossentropy": 2.3344188928604126, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.32225053012371063, + "step": 1414 + }, + { + "epoch": 0.0885, + "grad_norm": 3.40625, + "grad_norm_var": 0.06533915201822917, + "learning_rate": 0.0001, + "loss": 8.2794, + "loss/crossentropy": 2.181140899658203, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.25673504173755646, + "step": 1416 + }, + { + "epoch": 0.088625, + "grad_norm": 3.015625, + "grad_norm_var": 0.0631988525390625, + "learning_rate": 0.0001, + "loss": 8.4926, + "loss/crossentropy": 2.5540969371795654, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.28567659854888916, + "step": 1418 + }, + { + "epoch": 0.08875, + "grad_norm": 3.0625, + "grad_norm_var": 0.13014322916666668, + "learning_rate": 0.0001, + "loss": 8.3927, + "loss/crossentropy": 2.308061122894287, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26119405031204224, + "step": 1420 + }, + { + "epoch": 0.088875, + "grad_norm": 4.125, + "grad_norm_var": 0.22221577962239583, + "learning_rate": 0.0001, + "loss": 8.3563, + "loss/crossentropy": 2.337175130844116, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28924381732940674, + "step": 1422 + }, + { + "epoch": 0.089, + "grad_norm": 3.0625, + "grad_norm_var": 0.205322265625, + "learning_rate": 0.0001, + "loss": 8.2574, + "loss/crossentropy": 2.383028507232666, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2829654812812805, + "step": 1424 + }, + { + "epoch": 0.089125, + "grad_norm": 2.8125, + "grad_norm_var": 0.22934468587239584, + "learning_rate": 0.0001, + "loss": 8.0822, + "loss/crossentropy": 2.2887638807296753, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2677062898874283, + "step": 1426 + }, + { + "epoch": 0.08925, + "grad_norm": 2.9375, + "grad_norm_var": 0.23911031087239584, + "learning_rate": 0.0001, + "loss": 8.2723, + "loss/crossentropy": 2.2472543716430664, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2706802934408188, + "step": 1428 + }, + { + "epoch": 0.089375, + "grad_norm": 3.3125, + "grad_norm_var": 0.23624674479166666, + "learning_rate": 0.0001, + "loss": 8.378, + "loss/crossentropy": 2.517896294593811, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28536301851272583, + "step": 1430 + }, + { + "epoch": 0.0895, + "grad_norm": 2.984375, + "grad_norm_var": 0.23261311848958333, + "learning_rate": 0.0001, + "loss": 8.1376, + "loss/crossentropy": 2.118437886238098, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2512395307421684, + "step": 1432 + }, + { + "epoch": 0.089625, + "grad_norm": 3.1875, + "grad_norm_var": 0.22909749348958333, + "learning_rate": 0.0001, + "loss": 8.2629, + "loss/crossentropy": 2.177670121192932, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2840624302625656, + "step": 1434 + }, + { + "epoch": 0.08975, + "grad_norm": 3.28125, + "grad_norm_var": 0.16071675618489584, + "learning_rate": 0.0001, + "loss": 8.3297, + "loss/crossentropy": 2.422105073928833, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28876344859600067, + "step": 1436 + }, + { + "epoch": 0.089875, + "grad_norm": 2.65625, + "grad_norm_var": 0.03298238118489583, + "learning_rate": 0.0001, + "loss": 8.3754, + "loss/crossentropy": 2.469232678413391, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2923481911420822, + "step": 1438 + }, + { + "epoch": 0.09, + "grad_norm": 3.03125, + "grad_norm_var": 0.0335845947265625, + "learning_rate": 0.0001, + "loss": 8.2403, + "loss/crossentropy": 2.3302817344665527, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.26953746378421783, + "step": 1440 + }, + { + "epoch": 0.090125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0303131103515625, + "learning_rate": 0.0001, + "loss": 8.2342, + "loss/crossentropy": 2.1770907640457153, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26305052638053894, + "step": 1442 + }, + { + "epoch": 0.09025, + "grad_norm": 3.03125, + "grad_norm_var": 0.0291412353515625, + "learning_rate": 0.0001, + "loss": 8.2963, + "loss/crossentropy": 2.244715094566345, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2524953857064247, + "step": 1444 + }, + { + "epoch": 0.090375, + "grad_norm": 3.53125, + "grad_norm_var": 0.0806549072265625, + "learning_rate": 0.0001, + "loss": 8.2455, + "loss/crossentropy": 2.1872771978378296, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28185485303401947, + "step": 1446 + }, + { + "epoch": 0.0905, + "grad_norm": 2.796875, + "grad_norm_var": 0.09325764973958334, + "learning_rate": 0.0001, + "loss": 8.1763, + "loss/crossentropy": 2.388404607772827, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.25562550872564316, + "step": 1448 + }, + { + "epoch": 0.090625, + "grad_norm": 3.34375, + "grad_norm_var": 0.11158854166666667, + "learning_rate": 0.0001, + "loss": 8.2801, + "loss/crossentropy": 2.2114795446395874, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2575107663869858, + "step": 1450 + }, + { + "epoch": 0.09075, + "grad_norm": 2.859375, + "grad_norm_var": 0.11750895182291667, + "learning_rate": 0.0001, + "loss": 8.4195, + "loss/crossentropy": 2.268153190612793, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2590959519147873, + "step": 1452 + }, + { + "epoch": 0.090875, + "grad_norm": 3.0, + "grad_norm_var": 0.10434468587239583, + "learning_rate": 0.0001, + "loss": 8.1186, + "loss/crossentropy": 2.3246419429779053, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27099407464265823, + "step": 1454 + }, + { + "epoch": 0.091, + "grad_norm": 3.046875, + "grad_norm_var": 0.10079752604166667, + "learning_rate": 0.0001, + "loss": 8.3338, + "loss/crossentropy": 2.5052762031555176, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.29280832409858704, + "step": 1456 + }, + { + "epoch": 0.091125, + "grad_norm": 3.15625, + "grad_norm_var": 0.09954325358072917, + "learning_rate": 0.0001, + "loss": 8.3336, + "loss/crossentropy": 1.9449425339698792, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.275674507021904, + "step": 1458 + }, + { + "epoch": 0.09125, + "grad_norm": 2.8125, + "grad_norm_var": 0.10442301432291666, + "learning_rate": 0.0001, + "loss": 8.3734, + "loss/crossentropy": 2.3953585624694824, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.28133782744407654, + "step": 1460 + }, + { + "epoch": 0.091375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0473785400390625, + "learning_rate": 0.0001, + "loss": 8.0915, + "loss/crossentropy": 2.2441056966781616, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2531541734933853, + "step": 1462 + }, + { + "epoch": 0.0915, + "grad_norm": 3.0, + "grad_norm_var": 0.04156494140625, + "learning_rate": 0.0001, + "loss": 8.4787, + "loss/crossentropy": 2.4373109340667725, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2879520505666733, + "step": 1464 + }, + { + "epoch": 0.091625, + "grad_norm": 2.96875, + "grad_norm_var": 0.014850870768229166, + "learning_rate": 0.0001, + "loss": 8.101, + "loss/crossentropy": 2.2889362573623657, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.25555726885795593, + "step": 1466 + }, + { + "epoch": 0.09175, + "grad_norm": 3.171875, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 0.0001, + "loss": 8.1266, + "loss/crossentropy": 2.2097198963165283, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27271226048469543, + "step": 1468 + }, + { + "epoch": 0.091875, + "grad_norm": 3.171875, + "grad_norm_var": 0.020963541666666665, + "learning_rate": 0.0001, + "loss": 8.2212, + "loss/crossentropy": 2.181049108505249, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26682595908641815, + "step": 1470 + }, + { + "epoch": 0.092, + "grad_norm": 3.03125, + "grad_norm_var": 0.022163899739583333, + "learning_rate": 0.0001, + "loss": 8.2866, + "loss/crossentropy": 2.311566710472107, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.24793966859579086, + "step": 1472 + }, + { + "epoch": 0.092125, + "grad_norm": 3.203125, + "grad_norm_var": 0.022704060872395834, + "learning_rate": 0.0001, + "loss": 8.178, + "loss/crossentropy": 2.359419822692871, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26220113039016724, + "step": 1474 + }, + { + "epoch": 0.09225, + "grad_norm": 3.21875, + "grad_norm_var": 0.022001139322916665, + "learning_rate": 0.0001, + "loss": 8.4216, + "loss/crossentropy": 2.547469735145569, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2959955930709839, + "step": 1476 + }, + { + "epoch": 0.092375, + "grad_norm": 3.03125, + "grad_norm_var": 0.021100870768229165, + "learning_rate": 0.0001, + "loss": 8.2998, + "loss/crossentropy": 2.4008055925369263, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2753802388906479, + "step": 1478 + }, + { + "epoch": 0.0925, + "grad_norm": 2.671875, + "grad_norm_var": 0.028837076822916665, + "learning_rate": 0.0001, + "loss": 8.0442, + "loss/crossentropy": 2.049844443798065, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.25805267691612244, + "step": 1480 + }, + { + "epoch": 0.092625, + "grad_norm": 3.078125, + "grad_norm_var": 0.029100545247395835, + "learning_rate": 0.0001, + "loss": 8.3892, + "loss/crossentropy": 2.3216545581817627, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.27284783124923706, + "step": 1482 + }, + { + "epoch": 0.09275, + "grad_norm": 2.859375, + "grad_norm_var": 0.0282867431640625, + "learning_rate": 0.0001, + "loss": 8.2379, + "loss/crossentropy": 2.3917791843414307, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26631103456020355, + "step": 1484 + }, + { + "epoch": 0.092875, + "grad_norm": 3.21875, + "grad_norm_var": 0.028125, + "learning_rate": 0.0001, + "loss": 8.5063, + "loss/crossentropy": 2.536360025405884, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2958754599094391, + "step": 1486 + }, + { + "epoch": 0.093, + "grad_norm": 2.984375, + "grad_norm_var": 0.0410308837890625, + "learning_rate": 0.0001, + "loss": 8.3401, + "loss/crossentropy": 2.387327551841736, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28280311822891235, + "step": 1488 + }, + { + "epoch": 0.093125, + "grad_norm": 3.578125, + "grad_norm_var": 0.05903218587239583, + "learning_rate": 0.0001, + "loss": 8.3749, + "loss/crossentropy": 2.4892961978912354, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.29599107801914215, + "step": 1490 + }, + { + "epoch": 0.09325, + "grad_norm": 3.21875, + "grad_norm_var": 0.0744781494140625, + "learning_rate": 0.0001, + "loss": 8.2995, + "loss/crossentropy": 2.233021378517151, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.269522100687027, + "step": 1492 + }, + { + "epoch": 0.093375, + "grad_norm": 2.90625, + "grad_norm_var": 0.08401285807291667, + "learning_rate": 0.0001, + "loss": 8.1287, + "loss/crossentropy": 2.2778061628341675, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2637213170528412, + "step": 1494 + }, + { + "epoch": 0.0935, + "grad_norm": 3.21875, + "grad_norm_var": 0.08079020182291667, + "learning_rate": 0.0001, + "loss": 8.5223, + "loss/crossentropy": 2.364134907722473, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2790801376104355, + "step": 1496 + }, + { + "epoch": 0.093625, + "grad_norm": 2.734375, + "grad_norm_var": 0.09123942057291666, + "learning_rate": 0.0001, + "loss": 8.1725, + "loss/crossentropy": 2.0337949991226196, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.244078166782856, + "step": 1498 + }, + { + "epoch": 0.09375, + "grad_norm": 3.03125, + "grad_norm_var": 0.08502197265625, + "learning_rate": 0.0001, + "loss": 8.3314, + "loss/crossentropy": 2.307387113571167, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2793863117694855, + "step": 1500 + }, + { + "epoch": 0.093875, + "grad_norm": 2.84375, + "grad_norm_var": 0.09312744140625, + "learning_rate": 0.0001, + "loss": 7.9943, + "loss/crossentropy": 2.4143176078796387, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26304905116558075, + "step": 1502 + }, + { + "epoch": 0.094, + "grad_norm": 3.296875, + "grad_norm_var": 0.08209228515625, + "learning_rate": 0.0001, + "loss": 8.3215, + "loss/crossentropy": 1.9640471935272217, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.24391764402389526, + "step": 1504 + }, + { + "epoch": 0.094125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0652252197265625, + "learning_rate": 0.0001, + "loss": 8.204, + "loss/crossentropy": 2.405478358268738, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2646511495113373, + "step": 1506 + }, + { + "epoch": 0.09425, + "grad_norm": 2.796875, + "grad_norm_var": 0.046126302083333334, + "learning_rate": 0.0001, + "loss": 8.1211, + "loss/crossentropy": 2.0469033122062683, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2384241446852684, + "step": 1508 + }, + { + "epoch": 0.094375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0455230712890625, + "learning_rate": 0.0001, + "loss": 8.1709, + "loss/crossentropy": 2.2403076887130737, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2608010023832321, + "step": 1510 + }, + { + "epoch": 0.0945, + "grad_norm": 3.328125, + "grad_norm_var": 0.04589436848958333, + "learning_rate": 0.0001, + "loss": 8.1339, + "loss/crossentropy": 2.1499756574630737, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.271525114774704, + "step": 1512 + }, + { + "epoch": 0.094625, + "grad_norm": 2.734375, + "grad_norm_var": 0.04589436848958333, + "learning_rate": 0.0001, + "loss": 8.2903, + "loss/crossentropy": 2.1646158695220947, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2660486549139023, + "step": 1514 + }, + { + "epoch": 0.09475, + "grad_norm": 2.984375, + "grad_norm_var": 0.05085347493489583, + "learning_rate": 0.0001, + "loss": 7.9533, + "loss/crossentropy": 2.1994398832321167, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2655777484178543, + "step": 1516 + }, + { + "epoch": 0.094875, + "grad_norm": 3.171875, + "grad_norm_var": 0.047484334309895834, + "learning_rate": 0.0001, + "loss": 8.1863, + "loss/crossentropy": 2.0542885661125183, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2492573782801628, + "step": 1518 + }, + { + "epoch": 0.095, + "grad_norm": 2.84375, + "grad_norm_var": 0.046507771809895834, + "learning_rate": 0.0001, + "loss": 8.0486, + "loss/crossentropy": 2.382603883743286, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2706970274448395, + "step": 1520 + }, + { + "epoch": 0.095125, + "grad_norm": 3.0, + "grad_norm_var": 0.04146219889322917, + "learning_rate": 0.0001, + "loss": 8.288, + "loss/crossentropy": 2.2416555881500244, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26504258811473846, + "step": 1522 + }, + { + "epoch": 0.09525, + "grad_norm": 3.046875, + "grad_norm_var": 0.04072265625, + "learning_rate": 0.0001, + "loss": 8.3359, + "loss/crossentropy": 2.483952045440674, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.288076788187027, + "step": 1524 + }, + { + "epoch": 0.095375, + "grad_norm": 2.921875, + "grad_norm_var": 0.03912353515625, + "learning_rate": 0.0001, + "loss": 8.3248, + "loss/crossentropy": 2.6946524381637573, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2882222831249237, + "step": 1526 + }, + { + "epoch": 0.0955, + "grad_norm": 2.921875, + "grad_norm_var": 0.01900634765625, + "learning_rate": 0.0001, + "loss": 8.2016, + "loss/crossentropy": 1.9769355058670044, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2325659841299057, + "step": 1528 + }, + { + "epoch": 0.095625, + "grad_norm": 2.921875, + "grad_norm_var": 0.01513671875, + "learning_rate": 0.0001, + "loss": 8.2965, + "loss/crossentropy": 2.3487859964370728, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26721881330013275, + "step": 1530 + }, + { + "epoch": 0.09575, + "grad_norm": 3.59375, + "grad_norm_var": 0.039534505208333334, + "learning_rate": 0.0001, + "loss": 8.1814, + "loss/crossentropy": 2.2702786922454834, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2642149329185486, + "step": 1532 + }, + { + "epoch": 0.095875, + "grad_norm": 2.640625, + "grad_norm_var": 0.056962076822916666, + "learning_rate": 0.0001, + "loss": 8.2477, + "loss/crossentropy": 2.1052145957946777, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2681325525045395, + "step": 1534 + }, + { + "epoch": 0.096, + "grad_norm": 3.9375, + "grad_norm_var": 0.10074869791666667, + "learning_rate": 0.0001, + "loss": 8.158, + "loss/crossentropy": 2.356938362121582, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2789629250764847, + "step": 1536 + }, + { + "epoch": 0.096125, + "grad_norm": 2.671875, + "grad_norm_var": 0.11516825358072917, + "learning_rate": 0.0001, + "loss": 8.2291, + "loss/crossentropy": 2.1644341945648193, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2661993205547333, + "step": 1538 + }, + { + "epoch": 0.09625, + "grad_norm": 3.34375, + "grad_norm_var": 0.12259012858072917, + "learning_rate": 0.0001, + "loss": 8.1661, + "loss/crossentropy": 2.512505531311035, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.283734068274498, + "step": 1540 + }, + { + "epoch": 0.096375, + "grad_norm": 2.71875, + "grad_norm_var": 0.12873942057291668, + "learning_rate": 0.0001, + "loss": 8.1602, + "loss/crossentropy": 2.2558088302612305, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26760005950927734, + "step": 1542 + }, + { + "epoch": 0.0965, + "grad_norm": 3.0, + "grad_norm_var": 0.12698160807291667, + "learning_rate": 0.0001, + "loss": 8.1477, + "loss/crossentropy": 2.2201952934265137, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.26513203978538513, + "step": 1544 + }, + { + "epoch": 0.096625, + "grad_norm": 2.71875, + "grad_norm_var": 0.13869527180989583, + "learning_rate": 0.0001, + "loss": 8.1612, + "loss/crossentropy": 2.229737162590027, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2448047399520874, + "step": 1546 + }, + { + "epoch": 0.09675, + "grad_norm": 3.234375, + "grad_norm_var": 0.12079671223958334, + "learning_rate": 0.0001, + "loss": 8.1329, + "loss/crossentropy": 2.3728041648864746, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2712114453315735, + "step": 1548 + }, + { + "epoch": 0.096875, + "grad_norm": 3.0, + "grad_norm_var": 0.10236002604166666, + "learning_rate": 0.0001, + "loss": 8.2579, + "loss/crossentropy": 2.3911492824554443, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.28612037003040314, + "step": 1550 + }, + { + "epoch": 0.097, + "grad_norm": 3.21875, + "grad_norm_var": 0.054011027018229164, + "learning_rate": 0.0001, + "loss": 8.3094, + "loss/crossentropy": 2.3950345516204834, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.28530459105968475, + "step": 1552 + }, + { + "epoch": 0.097125, + "grad_norm": 2.96875, + "grad_norm_var": 0.051656087239583336, + "learning_rate": 0.0001, + "loss": 8.228, + "loss/crossentropy": 2.577568531036377, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26758988201618195, + "step": 1554 + }, + { + "epoch": 0.09725, + "grad_norm": 2.875, + "grad_norm_var": 0.0450592041015625, + "learning_rate": 0.0001, + "loss": 8.2982, + "loss/crossentropy": 2.4208312034606934, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2721874862909317, + "step": 1556 + }, + { + "epoch": 0.097375, + "grad_norm": 2.828125, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 8.1512, + "loss/crossentropy": 2.311411142349243, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2621803656220436, + "step": 1558 + }, + { + "epoch": 0.0975, + "grad_norm": 3.015625, + "grad_norm_var": 0.036295572916666664, + "learning_rate": 0.0001, + "loss": 8.278, + "loss/crossentropy": 2.180152475833893, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.255389466881752, + "step": 1560 + }, + { + "epoch": 0.097625, + "grad_norm": 2.953125, + "grad_norm_var": 0.027587890625, + "learning_rate": 0.0001, + "loss": 8.3438, + "loss/crossentropy": 2.5294106006622314, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26509322226047516, + "step": 1562 + }, + { + "epoch": 0.09775, + "grad_norm": 3.03125, + "grad_norm_var": 0.027099609375, + "learning_rate": 0.0001, + "loss": 8.2816, + "loss/crossentropy": 2.1683244705200195, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2431202381849289, + "step": 1564 + }, + { + "epoch": 0.097875, + "grad_norm": 3.265625, + "grad_norm_var": 0.029686482747395833, + "learning_rate": 0.0001, + "loss": 8.2192, + "loss/crossentropy": 2.2188292741775513, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.29626937210559845, + "step": 1566 + }, + { + "epoch": 0.098, + "grad_norm": 2.8125, + "grad_norm_var": 0.0247955322265625, + "learning_rate": 0.0001, + "loss": 8.2902, + "loss/crossentropy": 2.364318370819092, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.26975926756858826, + "step": 1568 + }, + { + "epoch": 0.098125, + "grad_norm": 3.0625, + "grad_norm_var": 0.02056884765625, + "learning_rate": 0.0001, + "loss": 8.3332, + "loss/crossentropy": 2.54610013961792, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28531205654144287, + "step": 1570 + }, + { + "epoch": 0.09825, + "grad_norm": 3.09375, + "grad_norm_var": 0.018659464518229165, + "learning_rate": 0.0001, + "loss": 8.2105, + "loss/crossentropy": 2.206403374671936, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.24771679937839508, + "step": 1572 + }, + { + "epoch": 0.098375, + "grad_norm": 3.078125, + "grad_norm_var": 0.015913899739583334, + "learning_rate": 0.0001, + "loss": 8.3768, + "loss/crossentropy": 2.3607594966888428, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.28001710772514343, + "step": 1574 + }, + { + "epoch": 0.0985, + "grad_norm": 3.15625, + "grad_norm_var": 0.021581013997395832, + "learning_rate": 0.0001, + "loss": 8.2712, + "loss/crossentropy": 2.2735308408737183, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26137876510620117, + "step": 1576 + }, + { + "epoch": 0.098625, + "grad_norm": 2.734375, + "grad_norm_var": 0.027912394205729166, + "learning_rate": 0.0001, + "loss": 8.2123, + "loss/crossentropy": 2.3609447479248047, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26791180670261383, + "step": 1578 + }, + { + "epoch": 0.09875, + "grad_norm": 3.234375, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 0.0001, + "loss": 8.3814, + "loss/crossentropy": 2.4629390239715576, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.29713912308216095, + "step": 1580 + }, + { + "epoch": 0.098875, + "grad_norm": 3.171875, + "grad_norm_var": 0.028902180989583335, + "learning_rate": 0.0001, + "loss": 8.3342, + "loss/crossentropy": 2.4477301836013794, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2905379682779312, + "step": 1582 + }, + { + "epoch": 0.099, + "grad_norm": 2.84375, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 8.1553, + "loss/crossentropy": 2.310616612434387, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.28200674057006836, + "step": 1584 + }, + { + "epoch": 0.099125, + "grad_norm": 2.984375, + "grad_norm_var": 0.02744140625, + "learning_rate": 0.0001, + "loss": 8.202, + "loss/crossentropy": 2.056865870952606, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2651172876358032, + "step": 1586 + }, + { + "epoch": 0.09925, + "grad_norm": 3.546875, + "grad_norm_var": 0.04753316243489583, + "learning_rate": 0.0001, + "loss": 8.3285, + "loss/crossentropy": 2.222190737724304, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2600644379854202, + "step": 1588 + }, + { + "epoch": 0.099375, + "grad_norm": 3.03125, + "grad_norm_var": 0.05275777180989583, + "learning_rate": 0.0001, + "loss": 8.3277, + "loss/crossentropy": 2.499345541000366, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27901938557624817, + "step": 1590 + }, + { + "epoch": 0.0995, + "grad_norm": 3.09375, + "grad_norm_var": 0.04485270182291667, + "learning_rate": 0.0001, + "loss": 8.306, + "loss/crossentropy": 2.4675090312957764, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27757471799850464, + "step": 1592 + }, + { + "epoch": 0.099625, + "grad_norm": 3.078125, + "grad_norm_var": 0.03870035807291667, + "learning_rate": 0.0001, + "loss": 8.1512, + "loss/crossentropy": 2.0948686599731445, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.24000447988510132, + "step": 1594 + }, + { + "epoch": 0.09975, + "grad_norm": 3.078125, + "grad_norm_var": 0.0365386962890625, + "learning_rate": 0.0001, + "loss": 8.2796, + "loss/crossentropy": 2.2303179502487183, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.25618939101696014, + "step": 1596 + }, + { + "epoch": 0.099875, + "grad_norm": 2.78125, + "grad_norm_var": 0.03728841145833333, + "learning_rate": 0.0001, + "loss": 8.2309, + "loss/crossentropy": 2.172394037246704, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26014433801174164, + "step": 1598 + }, + { + "epoch": 0.1, + "grad_norm": 3.0, + "grad_norm_var": 0.0431640625, + "learning_rate": 0.0001, + "loss": 8.0142, + "loss/crossentropy": 2.0257323384284973, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.23609444499015808, + "step": 1600 + }, + { + "epoch": 0.100125, + "grad_norm": 2.8125, + "grad_norm_var": 0.044514973958333336, + "learning_rate": 0.0001, + "loss": 8.0713, + "loss/crossentropy": 2.1410731077194214, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26623016595840454, + "step": 1602 + }, + { + "epoch": 0.10025, + "grad_norm": 2.984375, + "grad_norm_var": 0.021491495768229167, + "learning_rate": 0.0001, + "loss": 8.0494, + "loss/crossentropy": 2.3695082664489746, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2746554762125015, + "step": 1604 + }, + { + "epoch": 0.100375, + "grad_norm": 2.90625, + "grad_norm_var": 0.021637980143229166, + "learning_rate": 0.0001, + "loss": 8.1369, + "loss/crossentropy": 2.2459890842437744, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25808124244213104, + "step": 1606 + }, + { + "epoch": 0.1005, + "grad_norm": 3.25, + "grad_norm_var": 0.023591105143229166, + "learning_rate": 0.0001, + "loss": 8.3249, + "loss/crossentropy": 2.4524621963500977, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2866296321153641, + "step": 1608 + }, + { + "epoch": 0.100625, + "grad_norm": 3.0, + "grad_norm_var": 0.023128255208333334, + "learning_rate": 0.0001, + "loss": 8.2293, + "loss/crossentropy": 2.3052438497543335, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.27561667561531067, + "step": 1610 + }, + { + "epoch": 0.10075, + "grad_norm": 2.71875, + "grad_norm_var": 0.021174112955729168, + "learning_rate": 0.0001, + "loss": 8.1306, + "loss/crossentropy": 2.477377772331238, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.26571913063526154, + "step": 1612 + }, + { + "epoch": 0.100875, + "grad_norm": 2.90625, + "grad_norm_var": 0.1718414306640625, + "learning_rate": 0.0001, + "loss": 8.4299, + "loss/crossentropy": 2.2197115421295166, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2796938568353653, + "step": 1614 + }, + { + "epoch": 0.101, + "grad_norm": 3.078125, + "grad_norm_var": 0.17489827473958333, + "learning_rate": 0.0001, + "loss": 8.2917, + "loss/crossentropy": 2.2831382751464844, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2742185890674591, + "step": 1616 + }, + { + "epoch": 0.101125, + "grad_norm": 3.0625, + "grad_norm_var": 0.17009989420572916, + "learning_rate": 0.0001, + "loss": 8.2352, + "loss/crossentropy": 2.2610585689544678, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26356393098831177, + "step": 1618 + }, + { + "epoch": 0.10125, + "grad_norm": 3.046875, + "grad_norm_var": 0.16988525390625, + "learning_rate": 0.0001, + "loss": 8.2457, + "loss/crossentropy": 2.2453945875167847, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2588353157043457, + "step": 1620 + }, + { + "epoch": 0.101375, + "grad_norm": 3.203125, + "grad_norm_var": 0.16035054524739584, + "learning_rate": 0.0001, + "loss": 8.2079, + "loss/crossentropy": 2.2290210723876953, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.27493180334568024, + "step": 1622 + }, + { + "epoch": 0.1015, + "grad_norm": 3.0, + "grad_norm_var": 0.15942281087239582, + "learning_rate": 0.0001, + "loss": 8.3946, + "loss/crossentropy": 2.3389216661453247, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2627445012331009, + "step": 1624 + }, + { + "epoch": 0.101625, + "grad_norm": 3.625, + "grad_norm_var": 0.32203776041666665, + "learning_rate": 0.0001, + "loss": 8.2128, + "loss/crossentropy": 2.193161904811859, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2609563320875168, + "step": 1626 + }, + { + "epoch": 0.10175, + "grad_norm": 2.9375, + "grad_norm_var": 0.29754231770833334, + "learning_rate": 0.0001, + "loss": 8.2789, + "loss/crossentropy": 2.4037156105041504, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2840966284275055, + "step": 1628 + }, + { + "epoch": 0.101875, + "grad_norm": 14.4375, + "grad_norm_var": 8.05601298014323, + "learning_rate": 0.0001, + "loss": 8.7256, + "loss/crossentropy": 2.2983932495117188, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2895353436470032, + "step": 1630 + }, + { + "epoch": 0.102, + "grad_norm": 3.359375, + "grad_norm_var": 8.257710774739584, + "learning_rate": 0.0001, + "loss": 8.5242, + "loss/crossentropy": 2.5548095703125, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.29210618138313293, + "step": 1632 + }, + { + "epoch": 0.102125, + "grad_norm": 3.015625, + "grad_norm_var": 8.2908203125, + "learning_rate": 0.0001, + "loss": 8.2313, + "loss/crossentropy": 2.1125290393829346, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26304440200328827, + "step": 1634 + }, + { + "epoch": 0.10225, + "grad_norm": 3.0, + "grad_norm_var": 8.321890258789063, + "learning_rate": 0.0001, + "loss": 8.3129, + "loss/crossentropy": 2.4635796546936035, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.289856493473053, + "step": 1636 + }, + { + "epoch": 0.102375, + "grad_norm": 3.359375, + "grad_norm_var": 8.303043619791667, + "learning_rate": 0.0001, + "loss": 8.3342, + "loss/crossentropy": 2.4671066999435425, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.30989140272140503, + "step": 1638 + }, + { + "epoch": 0.1025, + "grad_norm": 2.859375, + "grad_norm_var": 8.330631510416667, + "learning_rate": 0.0001, + "loss": 8.3266, + "loss/crossentropy": 2.2032480239868164, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.28856223821640015, + "step": 1640 + }, + { + "epoch": 0.102625, + "grad_norm": 2.875, + "grad_norm_var": 8.449762980143229, + "learning_rate": 0.0001, + "loss": 8.0619, + "loss/crossentropy": 2.384071946144104, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27716881036758423, + "step": 1642 + }, + { + "epoch": 0.10275, + "grad_norm": 4.25, + "grad_norm_var": 8.37940165201823, + "learning_rate": 0.0001, + "loss": 8.2985, + "loss/crossentropy": 2.394433617591858, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27403971552848816, + "step": 1644 + }, + { + "epoch": 0.102875, + "grad_norm": 3.0, + "grad_norm_var": 0.6259073893229167, + "learning_rate": 0.0001, + "loss": 8.1275, + "loss/crossentropy": 2.5770705938339233, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2827526032924652, + "step": 1646 + }, + { + "epoch": 0.103, + "grad_norm": 2.9375, + "grad_norm_var": 0.11741434733072917, + "learning_rate": 0.0001, + "loss": 8.1994, + "loss/crossentropy": 2.203721523284912, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2632629871368408, + "step": 1648 + }, + { + "epoch": 0.103125, + "grad_norm": 2.78125, + "grad_norm_var": 0.11988525390625, + "learning_rate": 0.0001, + "loss": 8.27, + "loss/crossentropy": 2.190592408180237, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2594703510403633, + "step": 1650 + }, + { + "epoch": 0.10325, + "grad_norm": 3.109375, + "grad_norm_var": 0.122119140625, + "learning_rate": 0.0001, + "loss": 8.3436, + "loss/crossentropy": 2.3094884157180786, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2539513558149338, + "step": 1652 + }, + { + "epoch": 0.103375, + "grad_norm": 2.8125, + "grad_norm_var": 0.12323811848958334, + "learning_rate": 0.0001, + "loss": 8.2993, + "loss/crossentropy": 2.2542308568954468, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27265970408916473, + "step": 1654 + }, + { + "epoch": 0.1035, + "grad_norm": 3.65625, + "grad_norm_var": 0.14322916666666666, + "learning_rate": 0.0001, + "loss": 8.2358, + "loss/crossentropy": 2.150593101978302, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26226024329662323, + "step": 1656 + }, + { + "epoch": 0.103625, + "grad_norm": 2.796875, + "grad_norm_var": 0.15575764973958334, + "learning_rate": 0.0001, + "loss": 8.2424, + "loss/crossentropy": 2.3359317779541016, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2621028572320938, + "step": 1658 + }, + { + "epoch": 0.10375, + "grad_norm": 3.125, + "grad_norm_var": 0.07243550618489583, + "learning_rate": 0.0001, + "loss": 8.209, + "loss/crossentropy": 2.3354294300079346, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28642134368419647, + "step": 1660 + }, + { + "epoch": 0.103875, + "grad_norm": 2.890625, + "grad_norm_var": 0.07413736979166667, + "learning_rate": 0.0001, + "loss": 8.0643, + "loss/crossentropy": 2.2628813982009888, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2720167338848114, + "step": 1662 + }, + { + "epoch": 0.104, + "grad_norm": 2.640625, + "grad_norm_var": 0.08414713541666667, + "learning_rate": 0.0001, + "loss": 8.2252, + "loss/crossentropy": 2.405531644821167, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2855593413114548, + "step": 1664 + }, + { + "epoch": 0.104125, + "grad_norm": 3.453125, + "grad_norm_var": 0.09845377604166666, + "learning_rate": 0.0001, + "loss": 8.2726, + "loss/crossentropy": 2.348948836326599, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2590363919734955, + "step": 1666 + }, + { + "epoch": 0.10425, + "grad_norm": 2.6875, + "grad_norm_var": 0.108740234375, + "learning_rate": 0.0001, + "loss": 8.0444, + "loss/crossentropy": 2.275284171104431, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2520073354244232, + "step": 1668 + }, + { + "epoch": 0.104375, + "grad_norm": 3.171875, + "grad_norm_var": 0.10839436848958334, + "learning_rate": 0.0001, + "loss": 7.9805, + "loss/crossentropy": 2.160663425922394, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.25372669100761414, + "step": 1670 + }, + { + "epoch": 0.1045, + "grad_norm": 3.1875, + "grad_norm_var": 0.0904205322265625, + "learning_rate": 0.0001, + "loss": 8.2607, + "loss/crossentropy": 2.2359933853149414, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2621243894100189, + "step": 1672 + }, + { + "epoch": 0.104625, + "grad_norm": 2.78125, + "grad_norm_var": 0.06806233723958334, + "learning_rate": 0.0001, + "loss": 7.9523, + "loss/crossentropy": 1.9437886476516724, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2611730396747589, + "step": 1674 + }, + { + "epoch": 0.10475, + "grad_norm": 2.859375, + "grad_norm_var": 0.06712137858072917, + "learning_rate": 0.0001, + "loss": 8.0401, + "loss/crossentropy": 2.288792371749878, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2550469785928726, + "step": 1676 + }, + { + "epoch": 0.104875, + "grad_norm": 2.96875, + "grad_norm_var": 0.07136128743489584, + "learning_rate": 0.0001, + "loss": 8.2327, + "loss/crossentropy": 2.1720248460769653, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24942373484373093, + "step": 1678 + }, + { + "epoch": 0.105, + "grad_norm": 2.65625, + "grad_norm_var": 0.06942952473958333, + "learning_rate": 0.0001, + "loss": 8.2368, + "loss/crossentropy": 2.2941821813583374, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2739366888999939, + "step": 1680 + }, + { + "epoch": 0.105125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0464508056640625, + "learning_rate": 0.0001, + "loss": 8.234, + "loss/crossentropy": 2.255189538002014, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2702263593673706, + "step": 1682 + }, + { + "epoch": 0.10525, + "grad_norm": 3.078125, + "grad_norm_var": 0.0395660400390625, + "learning_rate": 0.0001, + "loss": 8.1081, + "loss/crossentropy": 2.419864535331726, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26446475088596344, + "step": 1684 + }, + { + "epoch": 0.105375, + "grad_norm": 2.734375, + "grad_norm_var": 0.041559855143229164, + "learning_rate": 0.0001, + "loss": 8.3004, + "loss/crossentropy": 2.329113721847534, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2699923515319824, + "step": 1686 + }, + { + "epoch": 0.1055, + "grad_norm": 2.890625, + "grad_norm_var": 0.02672119140625, + "learning_rate": 0.0001, + "loss": 8.0978, + "loss/crossentropy": 2.290674090385437, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2571800425648689, + "step": 1688 + }, + { + "epoch": 0.105625, + "grad_norm": 2.703125, + "grad_norm_var": 0.031787109375, + "learning_rate": 0.0001, + "loss": 7.992, + "loss/crossentropy": 2.194283127784729, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2518819496035576, + "step": 1690 + }, + { + "epoch": 0.10575, + "grad_norm": 3.625, + "grad_norm_var": 0.06621805826822917, + "learning_rate": 0.0001, + "loss": 8.0845, + "loss/crossentropy": 2.3665404319763184, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2546495646238327, + "step": 1692 + }, + { + "epoch": 0.105875, + "grad_norm": 2.796875, + "grad_norm_var": 0.06741536458333333, + "learning_rate": 0.0001, + "loss": 8.2148, + "loss/crossentropy": 2.6665724515914917, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26855309307575226, + "step": 1694 + }, + { + "epoch": 0.106, + "grad_norm": 2.59375, + "grad_norm_var": 0.06809488932291667, + "learning_rate": 0.0001, + "loss": 8.0213, + "loss/crossentropy": 2.0011618733406067, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2479224056005478, + "step": 1696 + }, + { + "epoch": 0.106125, + "grad_norm": 3.0, + "grad_norm_var": 0.06813151041666667, + "learning_rate": 0.0001, + "loss": 8.1117, + "loss/crossentropy": 2.233310341835022, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26155102252960205, + "step": 1698 + }, + { + "epoch": 0.10625, + "grad_norm": 3.078125, + "grad_norm_var": 0.067138671875, + "learning_rate": 0.0001, + "loss": 8.1435, + "loss/crossentropy": 2.023264706134796, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.23082198202610016, + "step": 1700 + }, + { + "epoch": 0.106375, + "grad_norm": 2.984375, + "grad_norm_var": 0.06722005208333333, + "learning_rate": 0.0001, + "loss": 8.0233, + "loss/crossentropy": 2.2503018379211426, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24713517725467682, + "step": 1702 + }, + { + "epoch": 0.1065, + "grad_norm": 2.796875, + "grad_norm_var": 0.07294820149739584, + "learning_rate": 0.0001, + "loss": 7.9907, + "loss/crossentropy": 2.557657241821289, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.26641707122325897, + "step": 1704 + }, + { + "epoch": 0.106625, + "grad_norm": 2.78125, + "grad_norm_var": 0.06492411295572917, + "learning_rate": 0.0001, + "loss": 7.9908, + "loss/crossentropy": 2.2161107063293457, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2457757443189621, + "step": 1706 + }, + { + "epoch": 0.10675, + "grad_norm": 3.15625, + "grad_norm_var": 0.041047159830729166, + "learning_rate": 0.0001, + "loss": 8.3599, + "loss/crossentropy": 2.307586431503296, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.25822295993566513, + "step": 1708 + }, + { + "epoch": 0.106875, + "grad_norm": 2.875, + "grad_norm_var": 0.03759765625, + "learning_rate": 0.0001, + "loss": 8.1595, + "loss/crossentropy": 2.215519666671753, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2511584535241127, + "step": 1710 + }, + { + "epoch": 0.107, + "grad_norm": 3.125, + "grad_norm_var": 0.03365478515625, + "learning_rate": 0.0001, + "loss": 8.3037, + "loss/crossentropy": 2.292387008666992, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2549327313899994, + "step": 1712 + }, + { + "epoch": 0.107125, + "grad_norm": 2.9375, + "grad_norm_var": 0.03853251139322917, + "learning_rate": 0.0001, + "loss": 8.1552, + "loss/crossentropy": 2.573517322540283, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27707283198833466, + "step": 1714 + }, + { + "epoch": 0.10725, + "grad_norm": 3.0625, + "grad_norm_var": 0.03942057291666667, + "learning_rate": 0.0001, + "loss": 8.1879, + "loss/crossentropy": 2.4223859310150146, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26580674946308136, + "step": 1716 + }, + { + "epoch": 0.107375, + "grad_norm": 2.828125, + "grad_norm_var": 0.03720296223958333, + "learning_rate": 0.0001, + "loss": 8.2149, + "loss/crossentropy": 2.5507869720458984, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2808096259832382, + "step": 1718 + }, + { + "epoch": 0.1075, + "grad_norm": 5.46875, + "grad_norm_var": 0.42760009765625, + "learning_rate": 0.0001, + "loss": 8.2746, + "loss/crossentropy": 2.4066158533096313, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27704988420009613, + "step": 1720 + }, + { + "epoch": 0.107625, + "grad_norm": 3.1875, + "grad_norm_var": 0.42789306640625, + "learning_rate": 0.0001, + "loss": 8.1194, + "loss/crossentropy": 2.3192564249038696, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2466306835412979, + "step": 1722 + }, + { + "epoch": 0.10775, + "grad_norm": 2.828125, + "grad_norm_var": 0.4310943603515625, + "learning_rate": 0.0001, + "loss": 8.1758, + "loss/crossentropy": 2.2524945735931396, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26875488460063934, + "step": 1724 + }, + { + "epoch": 0.107875, + "grad_norm": 2.921875, + "grad_norm_var": 0.4318359375, + "learning_rate": 0.0001, + "loss": 8.27, + "loss/crossentropy": 2.2209118604660034, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.25894972681999207, + "step": 1726 + }, + { + "epoch": 0.108, + "grad_norm": 3.0625, + "grad_norm_var": 0.43041890462239585, + "learning_rate": 0.0001, + "loss": 8.2942, + "loss/crossentropy": 2.156678080558777, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.24751365184783936, + "step": 1728 + }, + { + "epoch": 0.108125, + "grad_norm": 2.96875, + "grad_norm_var": 0.42939453125, + "learning_rate": 0.0001, + "loss": 7.9607, + "loss/crossentropy": 2.1267510652542114, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24761785566806793, + "step": 1730 + }, + { + "epoch": 0.10825, + "grad_norm": 2.6875, + "grad_norm_var": 0.4383941650390625, + "learning_rate": 0.0001, + "loss": 7.9771, + "loss/crossentropy": 2.300544857978821, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.23868989944458008, + "step": 1732 + }, + { + "epoch": 0.108375, + "grad_norm": 2.890625, + "grad_norm_var": 0.45806884765625, + "learning_rate": 0.0001, + "loss": 8.1917, + "loss/crossentropy": 2.360519051551819, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26660336554050446, + "step": 1734 + }, + { + "epoch": 0.1085, + "grad_norm": 2.859375, + "grad_norm_var": 0.06379292805989584, + "learning_rate": 0.0001, + "loss": 8.1922, + "loss/crossentropy": 2.1705552339553833, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2528446614742279, + "step": 1736 + }, + { + "epoch": 0.108625, + "grad_norm": 3.0625, + "grad_norm_var": 0.034357706705729164, + "learning_rate": 0.0001, + "loss": 8.1198, + "loss/crossentropy": 2.299151659011841, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25353947281837463, + "step": 1738 + }, + { + "epoch": 0.10875, + "grad_norm": 2.53125, + "grad_norm_var": 0.045210774739583334, + "learning_rate": 0.0001, + "loss": 8.2351, + "loss/crossentropy": 2.172037899494171, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2686986029148102, + "step": 1740 + }, + { + "epoch": 0.108875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0550201416015625, + "learning_rate": 0.0001, + "loss": 8.0653, + "loss/crossentropy": 2.3453006744384766, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2615263909101486, + "step": 1742 + }, + { + "epoch": 0.109, + "grad_norm": 2.59375, + "grad_norm_var": 0.06112874348958333, + "learning_rate": 0.0001, + "loss": 8.0488, + "loss/crossentropy": 2.4043914079666138, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2809004932641983, + "step": 1744 + }, + { + "epoch": 0.109125, + "grad_norm": 3.296875, + "grad_norm_var": 0.06349995930989584, + "learning_rate": 0.0001, + "loss": 8.3458, + "loss/crossentropy": 2.3624587059020996, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.29849672317504883, + "step": 1746 + }, + { + "epoch": 0.10925, + "grad_norm": 3.25, + "grad_norm_var": 0.06689453125, + "learning_rate": 0.0001, + "loss": 8.2422, + "loss/crossentropy": 2.299923300743103, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2804667204618454, + "step": 1748 + }, + { + "epoch": 0.109375, + "grad_norm": 2.625, + "grad_norm_var": 0.06067301432291667, + "learning_rate": 0.0001, + "loss": 8.0319, + "loss/crossentropy": 2.017127275466919, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.236178919672966, + "step": 1750 + }, + { + "epoch": 0.1095, + "grad_norm": 2.875, + "grad_norm_var": 0.05732014973958333, + "learning_rate": 0.0001, + "loss": 8.2062, + "loss/crossentropy": 2.665374517440796, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.28587816655635834, + "step": 1752 + }, + { + "epoch": 0.109625, + "grad_norm": 2.609375, + "grad_norm_var": 0.07112528483072916, + "learning_rate": 0.0001, + "loss": 8.129, + "loss/crossentropy": 2.2756296396255493, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2523636817932129, + "step": 1754 + }, + { + "epoch": 0.10975, + "grad_norm": 2.765625, + "grad_norm_var": 0.058259073893229166, + "learning_rate": 0.0001, + "loss": 7.9899, + "loss/crossentropy": 2.066028356552124, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23234106600284576, + "step": 1756 + }, + { + "epoch": 0.109875, + "grad_norm": 2.796875, + "grad_norm_var": 0.05266520182291667, + "learning_rate": 0.0001, + "loss": 8.1501, + "loss/crossentropy": 2.307652711868286, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26358823478221893, + "step": 1758 + }, + { + "epoch": 0.11, + "grad_norm": 2.84375, + "grad_norm_var": 0.0484375, + "learning_rate": 0.0001, + "loss": 8.0521, + "loss/crossentropy": 2.00510311126709, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2376161813735962, + "step": 1760 + }, + { + "epoch": 0.110125, + "grad_norm": 2.765625, + "grad_norm_var": 0.04407450358072917, + "learning_rate": 0.0001, + "loss": 8.1974, + "loss/crossentropy": 2.4039831161499023, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27686847746372223, + "step": 1762 + }, + { + "epoch": 0.11025, + "grad_norm": 2.8125, + "grad_norm_var": 0.03303934733072917, + "learning_rate": 0.0001, + "loss": 8.0579, + "loss/crossentropy": 2.3060104846954346, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2945534288883209, + "step": 1764 + }, + { + "epoch": 0.110375, + "grad_norm": 3.40625, + "grad_norm_var": 0.050047810872395834, + "learning_rate": 0.0001, + "loss": 8.0388, + "loss/crossentropy": 2.4445682764053345, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27699966728687286, + "step": 1766 + }, + { + "epoch": 0.1105, + "grad_norm": 2.75, + "grad_norm_var": 0.05078837076822917, + "learning_rate": 0.0001, + "loss": 8.2157, + "loss/crossentropy": 2.4216455221176147, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.28081244230270386, + "step": 1768 + }, + { + "epoch": 0.110625, + "grad_norm": 2.859375, + "grad_norm_var": 0.035521443684895834, + "learning_rate": 0.0001, + "loss": 8.0611, + "loss/crossentropy": 1.7756622433662415, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2289479374885559, + "step": 1770 + }, + { + "epoch": 0.11075, + "grad_norm": 2.9375, + "grad_norm_var": 0.03371480305989583, + "learning_rate": 0.0001, + "loss": 7.9772, + "loss/crossentropy": 2.1726499795913696, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2653361111879349, + "step": 1772 + }, + { + "epoch": 0.110875, + "grad_norm": 2.984375, + "grad_norm_var": 0.034016927083333336, + "learning_rate": 0.0001, + "loss": 8.0615, + "loss/crossentropy": 2.3147062063217163, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27281494438648224, + "step": 1774 + }, + { + "epoch": 0.111, + "grad_norm": 3.09375, + "grad_norm_var": 0.0338775634765625, + "learning_rate": 0.0001, + "loss": 8.1474, + "loss/crossentropy": 2.1700609922409058, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25692617893218994, + "step": 1776 + }, + { + "epoch": 0.111125, + "grad_norm": 2.8125, + "grad_norm_var": 0.0308990478515625, + "learning_rate": 0.0001, + "loss": 8.0753, + "loss/crossentropy": 2.114788770675659, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2623438090085983, + "step": 1778 + }, + { + "epoch": 0.11125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03291727701822917, + "learning_rate": 0.0001, + "loss": 8.232, + "loss/crossentropy": 2.3088366985321045, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25789759308099747, + "step": 1780 + }, + { + "epoch": 0.111375, + "grad_norm": 2.828125, + "grad_norm_var": 0.0161773681640625, + "learning_rate": 0.0001, + "loss": 8.1283, + "loss/crossentropy": 2.021032750606537, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26125916838645935, + "step": 1782 + }, + { + "epoch": 0.1115, + "grad_norm": 2.8125, + "grad_norm_var": 0.016520182291666668, + "learning_rate": 0.0001, + "loss": 8.3159, + "loss/crossentropy": 2.546655535697937, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26530279219150543, + "step": 1784 + }, + { + "epoch": 0.111625, + "grad_norm": 2.703125, + "grad_norm_var": 0.018094889322916665, + "learning_rate": 0.0001, + "loss": 8.0182, + "loss/crossentropy": 2.1505188941955566, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23876308649778366, + "step": 1786 + }, + { + "epoch": 0.11175, + "grad_norm": 2.875, + "grad_norm_var": 0.019880167643229165, + "learning_rate": 0.0001, + "loss": 8.0242, + "loss/crossentropy": 2.1757689714431763, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2557087540626526, + "step": 1788 + }, + { + "epoch": 0.111875, + "grad_norm": 2.953125, + "grad_norm_var": 0.01763916015625, + "learning_rate": 0.0001, + "loss": 8.0262, + "loss/crossentropy": 2.28451144695282, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26241403818130493, + "step": 1790 + }, + { + "epoch": 0.112, + "grad_norm": 2.765625, + "grad_norm_var": 0.016145833333333335, + "learning_rate": 0.0001, + "loss": 8.0235, + "loss/crossentropy": 2.11034619808197, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2549893856048584, + "step": 1792 + }, + { + "epoch": 0.112125, + "grad_norm": 2.828125, + "grad_norm_var": 0.014969889322916667, + "learning_rate": 0.0001, + "loss": 8.076, + "loss/crossentropy": 2.0472227931022644, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2562412843108177, + "step": 1794 + }, + { + "epoch": 0.11225, + "grad_norm": 2.796875, + "grad_norm_var": 0.0120758056640625, + "learning_rate": 0.0001, + "loss": 7.928, + "loss/crossentropy": 2.315675735473633, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2601415067911148, + "step": 1796 + }, + { + "epoch": 0.112375, + "grad_norm": 2.953125, + "grad_norm_var": 0.022443644205729165, + "learning_rate": 0.0001, + "loss": 7.8579, + "loss/crossentropy": 2.0402532815933228, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2341025322675705, + "step": 1798 + }, + { + "epoch": 0.1125, + "grad_norm": 2.78125, + "grad_norm_var": 0.02135009765625, + "learning_rate": 0.0001, + "loss": 8.2218, + "loss/crossentropy": 2.562678098678589, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2757340967655182, + "step": 1800 + }, + { + "epoch": 0.112625, + "grad_norm": 3.140625, + "grad_norm_var": 0.024657185872395834, + "learning_rate": 0.0001, + "loss": 8.1885, + "loss/crossentropy": 2.0969003438949585, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2326880842447281, + "step": 1802 + }, + { + "epoch": 0.11275, + "grad_norm": 2.734375, + "grad_norm_var": 0.030720011393229166, + "learning_rate": 0.0001, + "loss": 8.1163, + "loss/crossentropy": 2.332270383834839, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25439298152923584, + "step": 1804 + }, + { + "epoch": 0.112875, + "grad_norm": 3.125, + "grad_norm_var": 0.03774312337239583, + "learning_rate": 0.0001, + "loss": 8.106, + "loss/crossentropy": 2.162129521369934, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26606719195842743, + "step": 1806 + }, + { + "epoch": 0.113, + "grad_norm": 3.109375, + "grad_norm_var": 0.0377593994140625, + "learning_rate": 0.0001, + "loss": 8.1417, + "loss/crossentropy": 2.2731558084487915, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2553500384092331, + "step": 1808 + }, + { + "epoch": 0.113125, + "grad_norm": 2.6875, + "grad_norm_var": 0.0411529541015625, + "learning_rate": 0.0001, + "loss": 8.0807, + "loss/crossentropy": 2.10029274225235, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23542343080043793, + "step": 1810 + }, + { + "epoch": 0.11325, + "grad_norm": 2.875, + "grad_norm_var": 0.04257405598958333, + "learning_rate": 0.0001, + "loss": 8.1634, + "loss/crossentropy": 2.426058769226074, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2891880124807358, + "step": 1812 + }, + { + "epoch": 0.113375, + "grad_norm": 3.0625, + "grad_norm_var": 0.033154296875, + "learning_rate": 0.0001, + "loss": 8.0769, + "loss/crossentropy": 2.4051181077957153, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2494017630815506, + "step": 1814 + }, + { + "epoch": 0.1135, + "grad_norm": 2.671875, + "grad_norm_var": 0.03585611979166667, + "learning_rate": 0.0001, + "loss": 8.0919, + "loss/crossentropy": 2.0648642778396606, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23387089371681213, + "step": 1816 + }, + { + "epoch": 0.113625, + "grad_norm": 3.25, + "grad_norm_var": 0.0592681884765625, + "learning_rate": 0.0001, + "loss": 8.078, + "loss/crossentropy": 1.989893615245819, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.23796076327562332, + "step": 1818 + }, + { + "epoch": 0.11375, + "grad_norm": 2.828125, + "grad_norm_var": 0.0541656494140625, + "learning_rate": 0.0001, + "loss": 7.9998, + "loss/crossentropy": 1.9235325455665588, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.22104668617248535, + "step": 1820 + }, + { + "epoch": 0.113875, + "grad_norm": 2.828125, + "grad_norm_var": 0.05319010416666667, + "learning_rate": 0.0001, + "loss": 8.0283, + "loss/crossentropy": 2.39365816116333, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25356078147888184, + "step": 1822 + }, + { + "epoch": 0.114, + "grad_norm": 2.765625, + "grad_norm_var": 0.05628153483072917, + "learning_rate": 0.0001, + "loss": 8.0969, + "loss/crossentropy": 2.2098069190979004, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2322394847869873, + "step": 1824 + }, + { + "epoch": 0.114125, + "grad_norm": 2.59375, + "grad_norm_var": 0.05894266764322917, + "learning_rate": 0.0001, + "loss": 8.0474, + "loss/crossentropy": 2.3926165103912354, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26291845738887787, + "step": 1826 + }, + { + "epoch": 0.11425, + "grad_norm": 2.90625, + "grad_norm_var": 0.057494099934895834, + "learning_rate": 0.0001, + "loss": 8.0912, + "loss/crossentropy": 2.271665573120117, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.28214675188064575, + "step": 1828 + }, + { + "epoch": 0.114375, + "grad_norm": 2.96875, + "grad_norm_var": 0.055826822916666664, + "learning_rate": 0.0001, + "loss": 8.2937, + "loss/crossentropy": 2.3243162631988525, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25888554751873016, + "step": 1830 + }, + { + "epoch": 0.1145, + "grad_norm": 2.828125, + "grad_norm_var": 0.05383707682291667, + "learning_rate": 0.0001, + "loss": 8.158, + "loss/crossentropy": 2.4674028158187866, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2678636610507965, + "step": 1832 + }, + { + "epoch": 0.114625, + "grad_norm": 2.90625, + "grad_norm_var": 0.023795572916666667, + "learning_rate": 0.0001, + "loss": 8.2065, + "loss/crossentropy": 2.2178725004196167, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2575995400547981, + "step": 1834 + }, + { + "epoch": 0.11475, + "grad_norm": 2.90625, + "grad_norm_var": 0.023444620768229167, + "learning_rate": 0.0001, + "loss": 8.086, + "loss/crossentropy": 2.173088550567627, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25725461542606354, + "step": 1836 + }, + { + "epoch": 0.114875, + "grad_norm": 2.953125, + "grad_norm_var": 0.015576171875, + "learning_rate": 0.0001, + "loss": 8.013, + "loss/crossentropy": 2.167203664779663, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2551988810300827, + "step": 1838 + }, + { + "epoch": 0.115, + "grad_norm": 2.828125, + "grad_norm_var": 0.011714680989583334, + "learning_rate": 0.0001, + "loss": 8.0137, + "loss/crossentropy": 2.324142336845398, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25543810427188873, + "step": 1840 + }, + { + "epoch": 0.115125, + "grad_norm": 2.6875, + "grad_norm_var": 0.008675130208333333, + "learning_rate": 0.0001, + "loss": 7.8792, + "loss/crossentropy": 2.3638752698898315, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26999443769454956, + "step": 1842 + }, + { + "epoch": 0.11525, + "grad_norm": 2.859375, + "grad_norm_var": 0.014351399739583333, + "learning_rate": 0.0001, + "loss": 8.0577, + "loss/crossentropy": 2.236335277557373, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24563505500555038, + "step": 1844 + }, + { + "epoch": 0.115375, + "grad_norm": 2.78125, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 8.0579, + "loss/crossentropy": 2.3817098140716553, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25307345390319824, + "step": 1846 + }, + { + "epoch": 0.1155, + "grad_norm": 2.71875, + "grad_norm_var": 0.016373697916666666, + "learning_rate": 0.0001, + "loss": 7.796, + "loss/crossentropy": 2.2049105167388916, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2526453882455826, + "step": 1848 + }, + { + "epoch": 0.115625, + "grad_norm": 2.984375, + "grad_norm_var": 0.019429524739583332, + "learning_rate": 0.0001, + "loss": 8.2624, + "loss/crossentropy": 2.544666051864624, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.27838000655174255, + "step": 1850 + }, + { + "epoch": 0.11575, + "grad_norm": 2.859375, + "grad_norm_var": 0.0191070556640625, + "learning_rate": 0.0001, + "loss": 8.0722, + "loss/crossentropy": 2.4957003593444824, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25582027435302734, + "step": 1852 + }, + { + "epoch": 0.115875, + "grad_norm": 3.21875, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 7.9738, + "loss/crossentropy": 2.2852269411087036, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.27101629972457886, + "step": 1854 + }, + { + "epoch": 0.116, + "grad_norm": 2.953125, + "grad_norm_var": 0.030790201822916665, + "learning_rate": 0.0001, + "loss": 7.9897, + "loss/crossentropy": 2.1064014434814453, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24057136476039886, + "step": 1856 + }, + { + "epoch": 0.116125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03762105305989583, + "learning_rate": 0.0001, + "loss": 8.1089, + "loss/crossentropy": 2.352795124053955, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.23926259577274323, + "step": 1858 + }, + { + "epoch": 0.11625, + "grad_norm": 2.484375, + "grad_norm_var": 0.04638671875, + "learning_rate": 0.0001, + "loss": 7.8246, + "loss/crossentropy": 2.085222840309143, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.21554403752088547, + "step": 1860 + }, + { + "epoch": 0.116375, + "grad_norm": 2.65625, + "grad_norm_var": 0.0461578369140625, + "learning_rate": 0.0001, + "loss": 7.9895, + "loss/crossentropy": 1.9475982785224915, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2195342779159546, + "step": 1862 + }, + { + "epoch": 0.1165, + "grad_norm": 2.703125, + "grad_norm_var": 0.04849853515625, + "learning_rate": 0.0001, + "loss": 7.9607, + "loss/crossentropy": 2.4439034461975098, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27569329738616943, + "step": 1864 + }, + { + "epoch": 0.116625, + "grad_norm": 3.328125, + "grad_norm_var": 0.06008199055989583, + "learning_rate": 0.0001, + "loss": 7.9813, + "loss/crossentropy": 2.3087748289108276, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2708819806575775, + "step": 1866 + }, + { + "epoch": 0.11675, + "grad_norm": 2.859375, + "grad_norm_var": 0.06638895670572917, + "learning_rate": 0.0001, + "loss": 8.0842, + "loss/crossentropy": 2.2168221473693848, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2662223279476166, + "step": 1868 + }, + { + "epoch": 0.116875, + "grad_norm": 2.59375, + "grad_norm_var": 0.06785380045572917, + "learning_rate": 0.0001, + "loss": 8.0107, + "loss/crossentropy": 2.008695662021637, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.24834764003753662, + "step": 1870 + }, + { + "epoch": 0.117, + "grad_norm": 3.109375, + "grad_norm_var": 0.07073160807291666, + "learning_rate": 0.0001, + "loss": 8.1764, + "loss/crossentropy": 2.2949352860450745, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23978671431541443, + "step": 1872 + }, + { + "epoch": 0.117125, + "grad_norm": 2.8125, + "grad_norm_var": 0.0585357666015625, + "learning_rate": 0.0001, + "loss": 7.9191, + "loss/crossentropy": 2.1248743534088135, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24373694509267807, + "step": 1874 + }, + { + "epoch": 0.11725, + "grad_norm": 5.25, + "grad_norm_var": 0.4083984375, + "learning_rate": 0.0001, + "loss": 8.1591, + "loss/crossentropy": 2.4608160257339478, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2699667811393738, + "step": 1876 + }, + { + "epoch": 0.117375, + "grad_norm": 3.765625, + "grad_norm_var": 0.43925679524739586, + "learning_rate": 0.0001, + "loss": 8.2212, + "loss/crossentropy": 2.192078948020935, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26026079058647156, + "step": 1878 + }, + { + "epoch": 0.1175, + "grad_norm": 3.25, + "grad_norm_var": 0.4112701416015625, + "learning_rate": 0.0001, + "loss": 7.974, + "loss/crossentropy": 2.079145610332489, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2560829073190689, + "step": 1880 + }, + { + "epoch": 0.117625, + "grad_norm": 3.078125, + "grad_norm_var": 0.4034657796223958, + "learning_rate": 0.0001, + "loss": 8.0137, + "loss/crossentropy": 2.2801836133003235, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2451099008321762, + "step": 1882 + }, + { + "epoch": 0.11775, + "grad_norm": 2.734375, + "grad_norm_var": 0.43884175618489585, + "learning_rate": 0.0001, + "loss": 8.0881, + "loss/crossentropy": 2.373893141746521, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26466645300388336, + "step": 1884 + }, + { + "epoch": 0.117875, + "grad_norm": 3.0, + "grad_norm_var": 0.413330078125, + "learning_rate": 0.0001, + "loss": 7.9685, + "loss/crossentropy": 2.411695957183838, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2512810304760933, + "step": 1886 + }, + { + "epoch": 0.118, + "grad_norm": 2.640625, + "grad_norm_var": 0.42668355305989586, + "learning_rate": 0.0001, + "loss": 8.0557, + "loss/crossentropy": 2.049705147743225, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2417662888765335, + "step": 1888 + }, + { + "epoch": 0.118125, + "grad_norm": 2.890625, + "grad_norm_var": 0.44470113118489585, + "learning_rate": 0.0001, + "loss": 8.0032, + "loss/crossentropy": 2.1323426961898804, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23288051038980484, + "step": 1890 + }, + { + "epoch": 0.11825, + "grad_norm": 3.203125, + "grad_norm_var": 0.1299957275390625, + "learning_rate": 0.0001, + "loss": 8.1034, + "loss/crossentropy": 2.246406674385071, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26154619455337524, + "step": 1892 + }, + { + "epoch": 0.118375, + "grad_norm": 3.1875, + "grad_norm_var": 0.07519429524739583, + "learning_rate": 0.0001, + "loss": 8.2607, + "loss/crossentropy": 2.3726965188980103, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2482641637325287, + "step": 1894 + }, + { + "epoch": 0.1185, + "grad_norm": 2.921875, + "grad_norm_var": 0.06081441243489583, + "learning_rate": 0.0001, + "loss": 8.0314, + "loss/crossentropy": 2.2549182176589966, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2727925777435303, + "step": 1896 + }, + { + "epoch": 0.118625, + "grad_norm": 2.609375, + "grad_norm_var": 0.06313374837239584, + "learning_rate": 0.0001, + "loss": 8.2286, + "loss/crossentropy": 2.471170663833618, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27910932898521423, + "step": 1898 + }, + { + "epoch": 0.11875, + "grad_norm": 2.859375, + "grad_norm_var": 0.05249735514322917, + "learning_rate": 0.0001, + "loss": 8.0647, + "loss/crossentropy": 2.6244795322418213, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.28073398768901825, + "step": 1900 + }, + { + "epoch": 0.118875, + "grad_norm": 2.71875, + "grad_norm_var": 0.049128214518229164, + "learning_rate": 0.0001, + "loss": 7.8933, + "loss/crossentropy": 2.3852927684783936, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26651833206415176, + "step": 1902 + }, + { + "epoch": 0.119, + "grad_norm": 2.703125, + "grad_norm_var": 0.035521443684895834, + "learning_rate": 0.0001, + "loss": 8.0328, + "loss/crossentropy": 2.5933183431625366, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.271125927567482, + "step": 1904 + }, + { + "epoch": 0.119125, + "grad_norm": 3.4375, + "grad_norm_var": 0.05452372233072917, + "learning_rate": 0.0001, + "loss": 8.0956, + "loss/crossentropy": 2.2415446043014526, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.258744515478611, + "step": 1906 + }, + { + "epoch": 0.11925, + "grad_norm": 2.390625, + "grad_norm_var": 0.06295166015625, + "learning_rate": 0.0001, + "loss": 7.882, + "loss/crossentropy": 2.1915100812911987, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26428738236427307, + "step": 1908 + }, + { + "epoch": 0.119375, + "grad_norm": 2.9375, + "grad_norm_var": 0.059300740559895836, + "learning_rate": 0.0001, + "loss": 7.8918, + "loss/crossentropy": 2.024084210395813, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23683403432369232, + "step": 1910 + }, + { + "epoch": 0.1195, + "grad_norm": 2.546875, + "grad_norm_var": 0.08255208333333333, + "learning_rate": 0.0001, + "loss": 8.1278, + "loss/crossentropy": 2.103874683380127, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2549164593219757, + "step": 1912 + }, + { + "epoch": 0.119625, + "grad_norm": 2.859375, + "grad_norm_var": 0.0891021728515625, + "learning_rate": 0.0001, + "loss": 8.0439, + "loss/crossentropy": 2.1909857988357544, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25458941608667374, + "step": 1914 + }, + { + "epoch": 0.11975, + "grad_norm": 2.734375, + "grad_norm_var": 0.0890625, + "learning_rate": 0.0001, + "loss": 8.0189, + "loss/crossentropy": 2.3163031339645386, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26227420568466187, + "step": 1916 + }, + { + "epoch": 0.119875, + "grad_norm": 2.984375, + "grad_norm_var": 0.0921539306640625, + "learning_rate": 0.0001, + "loss": 8.1534, + "loss/crossentropy": 2.4677486419677734, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2759769409894943, + "step": 1918 + }, + { + "epoch": 0.12, + "grad_norm": 2.859375, + "grad_norm_var": 0.09339192708333334, + "learning_rate": 0.0001, + "loss": 8.004, + "loss/crossentropy": 2.3451250791549683, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.24791867285966873, + "step": 1920 + }, + { + "epoch": 0.120125, + "grad_norm": 2.53125, + "grad_norm_var": 0.06575113932291667, + "learning_rate": 0.0001, + "loss": 7.9536, + "loss/crossentropy": 2.39498770236969, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.26757124066352844, + "step": 1922 + }, + { + "epoch": 0.12025, + "grad_norm": 2.734375, + "grad_norm_var": 0.059912109375, + "learning_rate": 0.0001, + "loss": 7.9078, + "loss/crossentropy": 2.137160062789917, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.23972496390342712, + "step": 1924 + }, + { + "epoch": 0.120375, + "grad_norm": 3.203125, + "grad_norm_var": 0.06774800618489583, + "learning_rate": 0.0001, + "loss": 7.9543, + "loss/crossentropy": 2.354183554649353, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2630910202860832, + "step": 1926 + }, + { + "epoch": 0.1205, + "grad_norm": 2.796875, + "grad_norm_var": 0.04582926432291667, + "learning_rate": 0.0001, + "loss": 7.9081, + "loss/crossentropy": 2.0638818740844727, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2652597352862358, + "step": 1928 + }, + { + "epoch": 0.120625, + "grad_norm": 3.203125, + "grad_norm_var": 0.052534993489583334, + "learning_rate": 0.0001, + "loss": 8.0497, + "loss/crossentropy": 2.588783383369446, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2830745279788971, + "step": 1930 + }, + { + "epoch": 0.12075, + "grad_norm": 2.671875, + "grad_norm_var": 0.05359700520833333, + "learning_rate": 0.0001, + "loss": 8.0327, + "loss/crossentropy": 2.223568558692932, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25885971635580063, + "step": 1932 + }, + { + "epoch": 0.120875, + "grad_norm": 2.78125, + "grad_norm_var": 0.049641927083333336, + "learning_rate": 0.0001, + "loss": 7.9963, + "loss/crossentropy": 2.4340078830718994, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2645603120326996, + "step": 1934 + }, + { + "epoch": 0.121, + "grad_norm": 2.84375, + "grad_norm_var": 0.048046875, + "learning_rate": 0.0001, + "loss": 7.7813, + "loss/crossentropy": 1.9766615629196167, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.22679174691438675, + "step": 1936 + }, + { + "epoch": 0.121125, + "grad_norm": 2.796875, + "grad_norm_var": 0.043782552083333336, + "learning_rate": 0.0001, + "loss": 8.0526, + "loss/crossentropy": 2.007621169090271, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.232595793902874, + "step": 1938 + }, + { + "epoch": 0.12125, + "grad_norm": 2.875, + "grad_norm_var": 0.03530171712239583, + "learning_rate": 0.0001, + "loss": 8.0147, + "loss/crossentropy": 2.245633602142334, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2521408647298813, + "step": 1940 + }, + { + "epoch": 0.121375, + "grad_norm": 2.859375, + "grad_norm_var": 0.05499674479166667, + "learning_rate": 0.0001, + "loss": 8.1478, + "loss/crossentropy": 2.239235758781433, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.29093019664287567, + "step": 1942 + }, + { + "epoch": 0.1215, + "grad_norm": 3.125, + "grad_norm_var": 0.05185139973958333, + "learning_rate": 0.0001, + "loss": 8.0329, + "loss/crossentropy": 2.219251275062561, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24115260690450668, + "step": 1944 + }, + { + "epoch": 0.121625, + "grad_norm": 3.171875, + "grad_norm_var": 0.079296875, + "learning_rate": 0.0001, + "loss": 8.2787, + "loss/crossentropy": 2.3880057334899902, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2628382295370102, + "step": 1946 + }, + { + "epoch": 0.12175, + "grad_norm": 2.640625, + "grad_norm_var": 0.07984619140625, + "learning_rate": 0.0001, + "loss": 8.0593, + "loss/crossentropy": 1.9636898040771484, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.22413796931505203, + "step": 1948 + }, + { + "epoch": 0.121875, + "grad_norm": 3.09375, + "grad_norm_var": 0.07724202473958333, + "learning_rate": 0.0001, + "loss": 8.2408, + "loss/crossentropy": 2.4159456491470337, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2786535918712616, + "step": 1950 + }, + { + "epoch": 0.122, + "grad_norm": 2.9375, + "grad_norm_var": 0.06634114583333334, + "learning_rate": 0.0001, + "loss": 8.0623, + "loss/crossentropy": 2.125056028366089, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2584911435842514, + "step": 1952 + }, + { + "epoch": 0.122125, + "grad_norm": 2.8125, + "grad_norm_var": 0.07579752604166666, + "learning_rate": 0.0001, + "loss": 8.2316, + "loss/crossentropy": 2.15469229221344, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24431538581848145, + "step": 1954 + }, + { + "epoch": 0.12225, + "grad_norm": 2.828125, + "grad_norm_var": 0.0720855712890625, + "learning_rate": 0.0001, + "loss": 7.9661, + "loss/crossentropy": 2.124357581138611, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25534383952617645, + "step": 1956 + }, + { + "epoch": 0.122375, + "grad_norm": 2.6875, + "grad_norm_var": 0.058756510416666664, + "learning_rate": 0.0001, + "loss": 8.016, + "loss/crossentropy": 2.344644784927368, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26001378893852234, + "step": 1958 + }, + { + "epoch": 0.1225, + "grad_norm": 2.78125, + "grad_norm_var": 0.059130859375, + "learning_rate": 0.0001, + "loss": 8.0888, + "loss/crossentropy": 2.242557406425476, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25998418033123016, + "step": 1960 + }, + { + "epoch": 0.122625, + "grad_norm": 2.578125, + "grad_norm_var": 0.027339680989583334, + "learning_rate": 0.0001, + "loss": 7.9998, + "loss/crossentropy": 2.1519815921783447, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24088148772716522, + "step": 1962 + }, + { + "epoch": 0.12275, + "grad_norm": 3.09375, + "grad_norm_var": 0.0291015625, + "learning_rate": 0.0001, + "loss": 7.9661, + "loss/crossentropy": 2.0413911938667297, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2583937346935272, + "step": 1964 + }, + { + "epoch": 0.122875, + "grad_norm": 2.5625, + "grad_norm_var": 0.04091695149739583, + "learning_rate": 0.0001, + "loss": 8.0222, + "loss/crossentropy": 2.404345154762268, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.23985719680786133, + "step": 1966 + }, + { + "epoch": 0.123, + "grad_norm": 2.71875, + "grad_norm_var": 0.044709269205729166, + "learning_rate": 0.0001, + "loss": 8.0141, + "loss/crossentropy": 2.6169755458831787, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2697141170501709, + "step": 1968 + }, + { + "epoch": 0.123125, + "grad_norm": 2.90625, + "grad_norm_var": 0.045832316080729164, + "learning_rate": 0.0001, + "loss": 8.0295, + "loss/crossentropy": 2.2693088054656982, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2512510120868683, + "step": 1970 + }, + { + "epoch": 0.12325, + "grad_norm": 2.578125, + "grad_norm_var": 0.050699869791666664, + "learning_rate": 0.0001, + "loss": 7.9091, + "loss/crossentropy": 2.4330859184265137, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2577967271208763, + "step": 1972 + }, + { + "epoch": 0.123375, + "grad_norm": 3.046875, + "grad_norm_var": 0.052490234375, + "learning_rate": 0.0001, + "loss": 8.1396, + "loss/crossentropy": 2.560065507888794, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2873089164495468, + "step": 1974 + }, + { + "epoch": 0.1235, + "grad_norm": 3.1875, + "grad_norm_var": 0.060155232747395836, + "learning_rate": 0.0001, + "loss": 7.8746, + "loss/crossentropy": 2.174700140953064, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2541915774345398, + "step": 1976 + }, + { + "epoch": 0.123625, + "grad_norm": 2.765625, + "grad_norm_var": 0.05668843587239583, + "learning_rate": 0.0001, + "loss": 7.947, + "loss/crossentropy": 2.164485454559326, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25544650852680206, + "step": 1978 + }, + { + "epoch": 0.12375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06304423014322917, + "learning_rate": 0.0001, + "loss": 8.1361, + "loss/crossentropy": 2.110231041908264, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2481146827340126, + "step": 1980 + }, + { + "epoch": 0.123875, + "grad_norm": 2.78125, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 0.0001, + "loss": 8.0873, + "loss/crossentropy": 2.4308364391326904, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2560970336198807, + "step": 1982 + }, + { + "epoch": 0.124, + "grad_norm": 2.828125, + "grad_norm_var": 0.0412750244140625, + "learning_rate": 0.0001, + "loss": 8.0323, + "loss/crossentropy": 2.481392025947571, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2720007449388504, + "step": 1984 + }, + { + "epoch": 0.124125, + "grad_norm": 2.9375, + "grad_norm_var": 0.041975911458333334, + "learning_rate": 0.0001, + "loss": 7.857, + "loss/crossentropy": 2.1386367082595825, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26166096329689026, + "step": 1986 + }, + { + "epoch": 0.12425, + "grad_norm": 2.6875, + "grad_norm_var": 0.04220377604166667, + "learning_rate": 0.0001, + "loss": 8.0093, + "loss/crossentropy": 2.333972215652466, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25886698067188263, + "step": 1988 + }, + { + "epoch": 0.124375, + "grad_norm": 3.265625, + "grad_norm_var": 0.05032145182291667, + "learning_rate": 0.0001, + "loss": 8.0834, + "loss/crossentropy": 2.41828191280365, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24262161552906036, + "step": 1990 + }, + { + "epoch": 0.1245, + "grad_norm": 2.53125, + "grad_norm_var": 0.046052042643229166, + "learning_rate": 0.0001, + "loss": 8.0389, + "loss/crossentropy": 2.1740458011627197, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.23383785039186478, + "step": 1992 + }, + { + "epoch": 0.124625, + "grad_norm": 2.5625, + "grad_norm_var": 0.049117024739583334, + "learning_rate": 0.0001, + "loss": 8.0157, + "loss/crossentropy": 2.2331719398498535, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2640424221754074, + "step": 1994 + }, + { + "epoch": 0.12475, + "grad_norm": 3.015625, + "grad_norm_var": 0.04023335774739583, + "learning_rate": 0.0001, + "loss": 7.9991, + "loss/crossentropy": 2.1523889303207397, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2687607556581497, + "step": 1996 + }, + { + "epoch": 0.124875, + "grad_norm": 2.859375, + "grad_norm_var": 0.040526326497395834, + "learning_rate": 0.0001, + "loss": 7.9342, + "loss/crossentropy": 2.171301484107971, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.24432216584682465, + "step": 1998 + }, + { + "epoch": 0.125, + "grad_norm": 2.78125, + "grad_norm_var": 0.04273681640625, + "learning_rate": 0.0001, + "loss": 8.1111, + "loss/crossentropy": 2.2224671840667725, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.23374950140714645, + "step": 2000 + }, + { + "epoch": 0.125125, + "grad_norm": 2.5, + "grad_norm_var": 0.04103902180989583, + "learning_rate": 0.0001, + "loss": 8.0055, + "loss/crossentropy": 2.1209537386894226, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2525208741426468, + "step": 2002 + }, + { + "epoch": 0.12525, + "grad_norm": 2.5625, + "grad_norm_var": 0.04299723307291667, + "learning_rate": 0.0001, + "loss": 7.8668, + "loss/crossentropy": 2.1079421639442444, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25046999752521515, + "step": 2004 + }, + { + "epoch": 0.125375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03804931640625, + "learning_rate": 0.0001, + "loss": 7.9843, + "loss/crossentropy": 2.3907183408737183, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2595418617129326, + "step": 2006 + }, + { + "epoch": 0.1255, + "grad_norm": 2.78125, + "grad_norm_var": 0.0336578369140625, + "learning_rate": 0.0001, + "loss": 7.9723, + "loss/crossentropy": 2.081270694732666, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25873951613903046, + "step": 2008 + }, + { + "epoch": 0.125625, + "grad_norm": 3.046875, + "grad_norm_var": 0.033675130208333334, + "learning_rate": 0.0001, + "loss": 8.0587, + "loss/crossentropy": 2.204562723636627, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24240562319755554, + "step": 2010 + }, + { + "epoch": 0.12575, + "grad_norm": 2.6875, + "grad_norm_var": 0.030549112955729166, + "learning_rate": 0.0001, + "loss": 8.0825, + "loss/crossentropy": 2.234739661216736, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23210185766220093, + "step": 2012 + }, + { + "epoch": 0.125875, + "grad_norm": 2.984375, + "grad_norm_var": 0.03386128743489583, + "learning_rate": 0.0001, + "loss": 7.9408, + "loss/crossentropy": 2.155194342136383, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.24713550508022308, + "step": 2014 + }, + { + "epoch": 0.126, + "grad_norm": 2.921875, + "grad_norm_var": 0.03388671875, + "learning_rate": 0.0001, + "loss": 8.0006, + "loss/crossentropy": 2.1440590620040894, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24631793051958084, + "step": 2016 + }, + { + "epoch": 0.126125, + "grad_norm": 3.375, + "grad_norm_var": 0.06747945149739583, + "learning_rate": 0.0001, + "loss": 8.2657, + "loss/crossentropy": 2.2782651782035828, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25610214471817017, + "step": 2018 + }, + { + "epoch": 0.12625, + "grad_norm": 2.703125, + "grad_norm_var": 0.0567535400390625, + "learning_rate": 0.0001, + "loss": 7.9078, + "loss/crossentropy": 2.2536474466323853, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23063694685697556, + "step": 2020 + }, + { + "epoch": 0.126375, + "grad_norm": 3.890625, + "grad_norm_var": 0.1236724853515625, + "learning_rate": 0.0001, + "loss": 8.0493, + "loss/crossentropy": 2.2990732192993164, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.28477251529693604, + "step": 2022 + }, + { + "epoch": 0.1265, + "grad_norm": 3.15625, + "grad_norm_var": 0.1252593994140625, + "learning_rate": 0.0001, + "loss": 8.0902, + "loss/crossentropy": 2.3423362970352173, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27799367904663086, + "step": 2024 + }, + { + "epoch": 0.126625, + "grad_norm": 3.359375, + "grad_norm_var": 0.7259999593098958, + "learning_rate": 0.0001, + "loss": 8.3429, + "loss/crossentropy": 2.1187247037887573, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25325731933116913, + "step": 2026 + }, + { + "epoch": 0.12675, + "grad_norm": 3.328125, + "grad_norm_var": 0.6796061197916666, + "learning_rate": 0.0001, + "loss": 8.2877, + "loss/crossentropy": 2.6572694778442383, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.28467129170894623, + "step": 2028 + }, + { + "epoch": 0.126875, + "grad_norm": 2.953125, + "grad_norm_var": 0.6576171875, + "learning_rate": 0.0001, + "loss": 8.1366, + "loss/crossentropy": 2.149785280227661, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.33148565888404846, + "step": 2030 + }, + { + "epoch": 0.127, + "grad_norm": 2.984375, + "grad_norm_var": 0.6441243489583334, + "learning_rate": 0.0001, + "loss": 8.016, + "loss/crossentropy": 1.9550745487213135, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.21983042359352112, + "step": 2032 + }, + { + "epoch": 0.127125, + "grad_norm": 2.59375, + "grad_norm_var": 0.70074462890625, + "learning_rate": 0.0001, + "loss": 7.9985, + "loss/crossentropy": 2.4743508100509644, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2646654099225998, + "step": 2034 + }, + { + "epoch": 0.12725, + "grad_norm": 2.953125, + "grad_norm_var": 0.679931640625, + "learning_rate": 0.0001, + "loss": 7.7714, + "loss/crossentropy": 2.2703075408935547, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24541212618350983, + "step": 2036 + }, + { + "epoch": 0.127375, + "grad_norm": 2.875, + "grad_norm_var": 0.66455078125, + "learning_rate": 0.0001, + "loss": 8.2056, + "loss/crossentropy": 2.400794506072998, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2792155146598816, + "step": 2038 + }, + { + "epoch": 0.1275, + "grad_norm": 2.703125, + "grad_norm_var": 0.7059529622395834, + "learning_rate": 0.0001, + "loss": 7.9121, + "loss/crossentropy": 2.221606969833374, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27579738199710846, + "step": 2040 + }, + { + "epoch": 0.127625, + "grad_norm": 2.875, + "grad_norm_var": 0.05742085774739583, + "learning_rate": 0.0001, + "loss": 8.0984, + "loss/crossentropy": 2.1297446489334106, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2457616627216339, + "step": 2042 + }, + { + "epoch": 0.12775, + "grad_norm": 6.28125, + "grad_norm_var": 0.7566802978515625, + "learning_rate": 0.0001, + "loss": 8.1542, + "loss/crossentropy": 2.068065047264099, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2585330307483673, + "step": 2044 + }, + { + "epoch": 0.127875, + "grad_norm": 3.515625, + "grad_norm_var": 0.7755767822265625, + "learning_rate": 0.0001, + "loss": 8.2794, + "loss/crossentropy": 2.285371780395508, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26811927556991577, + "step": 2046 + }, + { + "epoch": 0.128, + "grad_norm": 3.015625, + "grad_norm_var": 0.7772420247395834, + "learning_rate": 0.0001, + "loss": 8.0508, + "loss/crossentropy": 2.102661430835724, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24708286672830582, + "step": 2048 + }, + { + "epoch": 0.128125, + "grad_norm": 2.78125, + "grad_norm_var": 0.7524648030598958, + "learning_rate": 0.0001, + "loss": 7.87, + "loss/crossentropy": 1.9193878173828125, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24057899415493011, + "step": 2050 + }, + { + "epoch": 0.12825, + "grad_norm": 2.828125, + "grad_norm_var": 0.7676096598307292, + "learning_rate": 0.0001, + "loss": 7.9655, + "loss/crossentropy": 2.2599531412124634, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2473764270544052, + "step": 2052 + }, + { + "epoch": 0.128375, + "grad_norm": 3.03125, + "grad_norm_var": 0.7643717447916667, + "learning_rate": 0.0001, + "loss": 8.0443, + "loss/crossentropy": 2.1840824484825134, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.21873797476291656, + "step": 2054 + }, + { + "epoch": 0.1285, + "grad_norm": 2.515625, + "grad_norm_var": 0.7554026285807292, + "learning_rate": 0.0001, + "loss": 7.8934, + "loss/crossentropy": 2.5172749757766724, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2471792846918106, + "step": 2056 + }, + { + "epoch": 0.128625, + "grad_norm": 2.71875, + "grad_norm_var": 0.7780436197916667, + "learning_rate": 0.0001, + "loss": 7.9608, + "loss/crossentropy": 2.5356470346450806, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27692335844039917, + "step": 2058 + }, + { + "epoch": 0.12875, + "grad_norm": 3.03125, + "grad_norm_var": 0.08212483723958333, + "learning_rate": 0.0001, + "loss": 8.1766, + "loss/crossentropy": 2.3050994873046875, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.27711644768714905, + "step": 2060 + }, + { + "epoch": 0.128875, + "grad_norm": 2.78125, + "grad_norm_var": 0.031245930989583334, + "learning_rate": 0.0001, + "loss": 8.0858, + "loss/crossentropy": 2.037451386451721, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.234269917011261, + "step": 2062 + }, + { + "epoch": 0.129, + "grad_norm": 2.609375, + "grad_norm_var": 0.035456339518229164, + "learning_rate": 0.0001, + "loss": 7.9019, + "loss/crossentropy": 2.333581566810608, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2544917017221451, + "step": 2064 + }, + { + "epoch": 0.129125, + "grad_norm": 2.71875, + "grad_norm_var": 0.031248982747395834, + "learning_rate": 0.0001, + "loss": 8.1845, + "loss/crossentropy": 2.3705108165740967, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2595665156841278, + "step": 2066 + }, + { + "epoch": 0.12925, + "grad_norm": 2.96875, + "grad_norm_var": 0.03257548014322917, + "learning_rate": 0.0001, + "loss": 8.0113, + "loss/crossentropy": 2.2181931734085083, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2753777801990509, + "step": 2068 + }, + { + "epoch": 0.129375, + "grad_norm": 2.6875, + "grad_norm_var": 0.028678385416666667, + "learning_rate": 0.0001, + "loss": 8.046, + "loss/crossentropy": 2.4384394884109497, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.260432630777359, + "step": 2070 + }, + { + "epoch": 0.1295, + "grad_norm": 3.03125, + "grad_norm_var": 0.022932942708333334, + "learning_rate": 0.0001, + "loss": 8.1072, + "loss/crossentropy": 2.2950222492218018, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2557060271501541, + "step": 2072 + }, + { + "epoch": 0.129625, + "grad_norm": 2.796875, + "grad_norm_var": 0.0248443603515625, + "learning_rate": 0.0001, + "loss": 7.8764, + "loss/crossentropy": 1.9588146209716797, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2530653849244118, + "step": 2074 + }, + { + "epoch": 0.12975, + "grad_norm": 2.859375, + "grad_norm_var": 0.020759073893229167, + "learning_rate": 0.0001, + "loss": 8.1442, + "loss/crossentropy": 2.174315929412842, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24201688915491104, + "step": 2076 + }, + { + "epoch": 0.129875, + "grad_norm": 2.875, + "grad_norm_var": 0.026642862955729166, + "learning_rate": 0.0001, + "loss": 7.8332, + "loss/crossentropy": 1.8437206149101257, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2369949370622635, + "step": 2078 + }, + { + "epoch": 0.13, + "grad_norm": 2.765625, + "grad_norm_var": 0.0224761962890625, + "learning_rate": 0.0001, + "loss": 8.1136, + "loss/crossentropy": 2.4198756217956543, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2665669322013855, + "step": 2080 + }, + { + "epoch": 0.130125, + "grad_norm": 2.765625, + "grad_norm_var": 0.030143229166666667, + "learning_rate": 0.0001, + "loss": 7.8881, + "loss/crossentropy": 2.051876664161682, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23426489531993866, + "step": 2082 + }, + { + "epoch": 0.13025, + "grad_norm": 2.59375, + "grad_norm_var": 0.031473795572916664, + "learning_rate": 0.0001, + "loss": 8.0093, + "loss/crossentropy": 2.455062747001648, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2640880271792412, + "step": 2084 + }, + { + "epoch": 0.130375, + "grad_norm": 2.875, + "grad_norm_var": 0.03137613932291667, + "learning_rate": 0.0001, + "loss": 8.0219, + "loss/crossentropy": 2.2999762296676636, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2465459704399109, + "step": 2086 + }, + { + "epoch": 0.1305, + "grad_norm": 2.828125, + "grad_norm_var": 0.03183186848958333, + "learning_rate": 0.0001, + "loss": 8.0045, + "loss/crossentropy": 2.2871525287628174, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24762800335884094, + "step": 2088 + }, + { + "epoch": 0.130625, + "grad_norm": 2.953125, + "grad_norm_var": 0.03261617024739583, + "learning_rate": 0.0001, + "loss": 7.8668, + "loss/crossentropy": 2.3214457035064697, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.25224703550338745, + "step": 2090 + }, + { + "epoch": 0.13075, + "grad_norm": 2.390625, + "grad_norm_var": 0.042313639322916666, + "learning_rate": 0.0001, + "loss": 8.0465, + "loss/crossentropy": 2.1685001850128174, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2591954469680786, + "step": 2092 + }, + { + "epoch": 0.130875, + "grad_norm": 3.078125, + "grad_norm_var": 0.041136678059895834, + "learning_rate": 0.0001, + "loss": 8.0284, + "loss/crossentropy": 2.5781397819519043, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26295357197523117, + "step": 2094 + }, + { + "epoch": 0.131, + "grad_norm": 2.5, + "grad_norm_var": 0.045466105143229164, + "learning_rate": 0.0001, + "loss": 7.8661, + "loss/crossentropy": 2.1926894187927246, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23267576098442078, + "step": 2096 + }, + { + "epoch": 0.131125, + "grad_norm": 3.1875, + "grad_norm_var": 0.04641520182291667, + "learning_rate": 0.0001, + "loss": 7.9553, + "loss/crossentropy": 2.1911760568618774, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25542213022708893, + "step": 2098 + }, + { + "epoch": 0.13125, + "grad_norm": 2.78125, + "grad_norm_var": 0.04755452473958333, + "learning_rate": 0.0001, + "loss": 7.958, + "loss/crossentropy": 2.2775418758392334, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24572113156318665, + "step": 2100 + }, + { + "epoch": 0.131375, + "grad_norm": 2.78125, + "grad_norm_var": 0.04748942057291667, + "learning_rate": 0.0001, + "loss": 8.0769, + "loss/crossentropy": 2.3823719024658203, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2616809457540512, + "step": 2102 + }, + { + "epoch": 0.1315, + "grad_norm": 2.90625, + "grad_norm_var": 0.047240193684895834, + "learning_rate": 0.0001, + "loss": 8.0382, + "loss/crossentropy": 2.4556522369384766, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2946828603744507, + "step": 2104 + }, + { + "epoch": 0.131625, + "grad_norm": 3.0, + "grad_norm_var": 0.05573628743489583, + "learning_rate": 0.0001, + "loss": 8.0877, + "loss/crossentropy": 2.195865511894226, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26489073038101196, + "step": 2106 + }, + { + "epoch": 0.13175, + "grad_norm": 2.375, + "grad_norm_var": 0.05614827473958333, + "learning_rate": 0.0001, + "loss": 7.9057, + "loss/crossentropy": 2.373032331466675, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24483656883239746, + "step": 2108 + }, + { + "epoch": 0.131875, + "grad_norm": 2.765625, + "grad_norm_var": 0.055497233072916666, + "learning_rate": 0.0001, + "loss": 8.0201, + "loss/crossentropy": 2.4894858598709106, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2641270160675049, + "step": 2110 + }, + { + "epoch": 0.132, + "grad_norm": 2.71875, + "grad_norm_var": 0.08896484375, + "learning_rate": 0.0001, + "loss": 7.9225, + "loss/crossentropy": 2.312375068664551, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24002444744110107, + "step": 2112 + }, + { + "epoch": 0.132125, + "grad_norm": 2.765625, + "grad_norm_var": 0.0932037353515625, + "learning_rate": 0.0001, + "loss": 7.9214, + "loss/crossentropy": 2.000899076461792, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23694107681512833, + "step": 2114 + }, + { + "epoch": 0.13225, + "grad_norm": 2.9375, + "grad_norm_var": 0.09410807291666666, + "learning_rate": 0.0001, + "loss": 8.0863, + "loss/crossentropy": 2.2505098581314087, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2779449298977852, + "step": 2116 + }, + { + "epoch": 0.132375, + "grad_norm": 2.90625, + "grad_norm_var": 0.0931549072265625, + "learning_rate": 0.0001, + "loss": 8.2278, + "loss/crossentropy": 2.533925771713257, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27186477184295654, + "step": 2118 + }, + { + "epoch": 0.1325, + "grad_norm": 3.0625, + "grad_norm_var": 0.16379292805989584, + "learning_rate": 0.0001, + "loss": 7.9833, + "loss/crossentropy": 2.2135390043258667, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2325379028916359, + "step": 2120 + }, + { + "epoch": 0.132625, + "grad_norm": 3.03125, + "grad_norm_var": 0.1632232666015625, + "learning_rate": 0.0001, + "loss": 8.0453, + "loss/crossentropy": 2.428161382675171, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.26223746687173843, + "step": 2122 + }, + { + "epoch": 0.13275, + "grad_norm": 2.5625, + "grad_norm_var": 0.1512603759765625, + "learning_rate": 0.0001, + "loss": 7.9349, + "loss/crossentropy": 2.6289626359939575, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.272312268614769, + "step": 2124 + }, + { + "epoch": 0.132875, + "grad_norm": 3.015625, + "grad_norm_var": 0.15465087890625, + "learning_rate": 0.0001, + "loss": 8.0484, + "loss/crossentropy": 2.081725239753723, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25036202371120453, + "step": 2126 + }, + { + "epoch": 0.133, + "grad_norm": 3.109375, + "grad_norm_var": 0.12092692057291667, + "learning_rate": 0.0001, + "loss": 8.1453, + "loss/crossentropy": 2.4221293926239014, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2642197906970978, + "step": 2128 + }, + { + "epoch": 0.133125, + "grad_norm": 2.609375, + "grad_norm_var": 0.11172587076822917, + "learning_rate": 0.0001, + "loss": 8.0771, + "loss/crossentropy": 2.385019063949585, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2672020420432091, + "step": 2130 + }, + { + "epoch": 0.13325, + "grad_norm": 3.046875, + "grad_norm_var": 0.10976155598958333, + "learning_rate": 0.0001, + "loss": 7.9391, + "loss/crossentropy": 2.356515049934387, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23601362109184265, + "step": 2132 + }, + { + "epoch": 0.133375, + "grad_norm": 2.796875, + "grad_norm_var": 0.11910400390625, + "learning_rate": 0.0001, + "loss": 7.8618, + "loss/crossentropy": 2.4709160327911377, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26310209929943085, + "step": 2134 + }, + { + "epoch": 0.1335, + "grad_norm": 2.625, + "grad_norm_var": 0.04101155598958333, + "learning_rate": 0.0001, + "loss": 7.9999, + "loss/crossentropy": 2.4431287050247192, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25448351353406906, + "step": 2136 + }, + { + "epoch": 0.133625, + "grad_norm": 2.734375, + "grad_norm_var": 0.0377838134765625, + "learning_rate": 0.0001, + "loss": 8.0923, + "loss/crossentropy": 2.361445426940918, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24513405561447144, + "step": 2138 + }, + { + "epoch": 0.13375, + "grad_norm": 2.84375, + "grad_norm_var": 0.03205464680989583, + "learning_rate": 0.0001, + "loss": 8.0121, + "loss/crossentropy": 2.400641083717346, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2581202983856201, + "step": 2140 + }, + { + "epoch": 0.133875, + "grad_norm": 2.390625, + "grad_norm_var": 0.0412506103515625, + "learning_rate": 0.0001, + "loss": 7.8979, + "loss/crossentropy": 2.0805707573890686, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2818114757537842, + "step": 2142 + }, + { + "epoch": 0.134, + "grad_norm": 3.375, + "grad_norm_var": 0.05451558430989583, + "learning_rate": 0.0001, + "loss": 8.1184, + "loss/crossentropy": 2.480680823326111, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.29633618891239166, + "step": 2144 + }, + { + "epoch": 0.134125, + "grad_norm": 2.625, + "grad_norm_var": 0.0557037353515625, + "learning_rate": 0.0001, + "loss": 7.8102, + "loss/crossentropy": 2.002712309360504, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.23709578067064285, + "step": 2146 + }, + { + "epoch": 0.13425, + "grad_norm": 2.609375, + "grad_norm_var": 0.05565999348958333, + "learning_rate": 0.0001, + "loss": 7.8972, + "loss/crossentropy": 2.405007004737854, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2549060881137848, + "step": 2148 + }, + { + "epoch": 0.134375, + "grad_norm": 2.6875, + "grad_norm_var": 0.05328369140625, + "learning_rate": 0.0001, + "loss": 7.9409, + "loss/crossentropy": 2.137619376182556, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2625036686658859, + "step": 2150 + }, + { + "epoch": 0.1345, + "grad_norm": 2.75, + "grad_norm_var": 0.04928385416666667, + "learning_rate": 0.0001, + "loss": 7.983, + "loss/crossentropy": 2.3216036558151245, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26807793229818344, + "step": 2152 + }, + { + "epoch": 0.134625, + "grad_norm": 3.265625, + "grad_norm_var": 0.06470947265625, + "learning_rate": 0.0001, + "loss": 7.8514, + "loss/crossentropy": 2.3814263343811035, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2562423348426819, + "step": 2154 + }, + { + "epoch": 0.13475, + "grad_norm": 2.75, + "grad_norm_var": 0.07095947265625, + "learning_rate": 0.0001, + "loss": 8.0623, + "loss/crossentropy": 2.3068708181381226, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2527267262339592, + "step": 2156 + }, + { + "epoch": 0.134875, + "grad_norm": 2.609375, + "grad_norm_var": 0.05712483723958333, + "learning_rate": 0.0001, + "loss": 8.1027, + "loss/crossentropy": 2.242267608642578, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2507360577583313, + "step": 2158 + }, + { + "epoch": 0.135, + "grad_norm": 2.671875, + "grad_norm_var": 0.04262593587239583, + "learning_rate": 0.0001, + "loss": 7.7535, + "loss/crossentropy": 2.259010672569275, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2302761897444725, + "step": 2160 + }, + { + "epoch": 0.135125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04046223958333333, + "learning_rate": 0.0001, + "loss": 7.8616, + "loss/crossentropy": 2.195171058177948, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23362931609153748, + "step": 2162 + }, + { + "epoch": 0.13525, + "grad_norm": 2.765625, + "grad_norm_var": 0.03961588541666667, + "learning_rate": 0.0001, + "loss": 7.9037, + "loss/crossentropy": 2.306097149848938, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.27228541672229767, + "step": 2164 + }, + { + "epoch": 0.135375, + "grad_norm": 2.75, + "grad_norm_var": 0.039937337239583336, + "learning_rate": 0.0001, + "loss": 7.9223, + "loss/crossentropy": 2.354483962059021, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.25157542526721954, + "step": 2166 + }, + { + "epoch": 0.1355, + "grad_norm": 2.953125, + "grad_norm_var": 0.04169514973958333, + "learning_rate": 0.0001, + "loss": 8.0153, + "loss/crossentropy": 2.555723190307617, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2608166038990021, + "step": 2168 + }, + { + "epoch": 0.135625, + "grad_norm": 2.59375, + "grad_norm_var": 0.026976521809895834, + "learning_rate": 0.0001, + "loss": 8.0232, + "loss/crossentropy": 2.314175248146057, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2557505890727043, + "step": 2170 + }, + { + "epoch": 0.13575, + "grad_norm": 2.640625, + "grad_norm_var": 0.0195709228515625, + "learning_rate": 0.0001, + "loss": 7.9036, + "loss/crossentropy": 2.252376437187195, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.26094751060009, + "step": 2172 + }, + { + "epoch": 0.135875, + "grad_norm": 2.578125, + "grad_norm_var": 0.020182291666666668, + "learning_rate": 0.0001, + "loss": 7.7816, + "loss/crossentropy": 2.0808385610580444, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.25295551121234894, + "step": 2174 + }, + { + "epoch": 0.136, + "grad_norm": 2.703125, + "grad_norm_var": 0.016145833333333335, + "learning_rate": 0.0001, + "loss": 7.9248, + "loss/crossentropy": 2.2166486978530884, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2506643235683441, + "step": 2176 + }, + { + "epoch": 0.136125, + "grad_norm": 2.734375, + "grad_norm_var": 0.0152496337890625, + "learning_rate": 0.0001, + "loss": 7.9413, + "loss/crossentropy": 2.4114983081817627, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24629026651382446, + "step": 2178 + }, + { + "epoch": 0.13625, + "grad_norm": 2.984375, + "grad_norm_var": 0.017508951822916667, + "learning_rate": 0.0001, + "loss": 7.9527, + "loss/crossentropy": 1.9971612095832825, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23709237575531006, + "step": 2180 + }, + { + "epoch": 0.136375, + "grad_norm": 2.734375, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 8.0183, + "loss/crossentropy": 2.598210096359253, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2603771388530731, + "step": 2182 + }, + { + "epoch": 0.1365, + "grad_norm": 2.9375, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 8.0132, + "loss/crossentropy": 2.382105231285095, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24392583966255188, + "step": 2184 + }, + { + "epoch": 0.136625, + "grad_norm": 2.59375, + "grad_norm_var": 0.022834269205729167, + "learning_rate": 0.0001, + "loss": 8.047, + "loss/crossentropy": 2.4047285318374634, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27039480209350586, + "step": 2186 + }, + { + "epoch": 0.13675, + "grad_norm": 2.4375, + "grad_norm_var": 0.032938639322916664, + "learning_rate": 0.0001, + "loss": 7.7846, + "loss/crossentropy": 2.0133553743362427, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.242934912443161, + "step": 2188 + }, + { + "epoch": 0.136875, + "grad_norm": 3.078125, + "grad_norm_var": 0.043701171875, + "learning_rate": 0.0001, + "loss": 8.1042, + "loss/crossentropy": 2.4624531269073486, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26904913038015366, + "step": 2190 + }, + { + "epoch": 0.137, + "grad_norm": 2.640625, + "grad_norm_var": 0.048216756184895834, + "learning_rate": 0.0001, + "loss": 7.9589, + "loss/crossentropy": 2.3441646099090576, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2756097614765167, + "step": 2192 + }, + { + "epoch": 0.137125, + "grad_norm": 2.609375, + "grad_norm_var": 0.0473297119140625, + "learning_rate": 0.0001, + "loss": 7.9626, + "loss/crossentropy": 2.0180709958076477, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.22330156713724136, + "step": 2194 + }, + { + "epoch": 0.13725, + "grad_norm": 4.875, + "grad_norm_var": 0.3212076822916667, + "learning_rate": 0.0001, + "loss": 8.1305, + "loss/crossentropy": 2.247686982154846, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.257401205599308, + "step": 2196 + }, + { + "epoch": 0.137375, + "grad_norm": 3.125, + "grad_norm_var": 0.3214182535807292, + "learning_rate": 0.0001, + "loss": 7.8432, + "loss/crossentropy": 2.387032985687256, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2718297243118286, + "step": 2198 + }, + { + "epoch": 0.1375, + "grad_norm": 2.84375, + "grad_norm_var": 0.3174957275390625, + "learning_rate": 0.0001, + "loss": 7.9656, + "loss/crossentropy": 2.1113094091415405, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2628230005502701, + "step": 2200 + }, + { + "epoch": 0.137625, + "grad_norm": 2.84375, + "grad_norm_var": 0.3123931884765625, + "learning_rate": 0.0001, + "loss": 8.1272, + "loss/crossentropy": 2.681854248046875, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.30167150497436523, + "step": 2202 + }, + { + "epoch": 0.13775, + "grad_norm": 2.71875, + "grad_norm_var": 0.28544514973958335, + "learning_rate": 0.0001, + "loss": 7.8842, + "loss/crossentropy": 2.2539913654327393, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2667968273162842, + "step": 2204 + }, + { + "epoch": 0.137875, + "grad_norm": 2.984375, + "grad_norm_var": 0.28813374837239586, + "learning_rate": 0.0001, + "loss": 7.994, + "loss/crossentropy": 2.336976170539856, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2730831503868103, + "step": 2206 + }, + { + "epoch": 0.138, + "grad_norm": 2.90625, + "grad_norm_var": 0.28128153483072915, + "learning_rate": 0.0001, + "loss": 7.8908, + "loss/crossentropy": 2.137080729007721, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2353602722287178, + "step": 2208 + }, + { + "epoch": 0.138125, + "grad_norm": 2.5, + "grad_norm_var": 0.28933919270833336, + "learning_rate": 0.0001, + "loss": 7.8058, + "loss/crossentropy": 2.0887175798416138, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2580728679895401, + "step": 2210 + }, + { + "epoch": 0.13825, + "grad_norm": 2.640625, + "grad_norm_var": 0.030269368489583334, + "learning_rate": 0.0001, + "loss": 8.0849, + "loss/crossentropy": 1.7958271503448486, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24919769912958145, + "step": 2212 + }, + { + "epoch": 0.138375, + "grad_norm": 2.640625, + "grad_norm_var": 0.0386871337890625, + "learning_rate": 0.0001, + "loss": 7.8756, + "loss/crossentropy": 2.135870099067688, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24682459235191345, + "step": 2214 + }, + { + "epoch": 0.1385, + "grad_norm": 2.734375, + "grad_norm_var": 0.08369038899739584, + "learning_rate": 0.0001, + "loss": 8.0304, + "loss/crossentropy": 2.346623420715332, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.24961213767528534, + "step": 2216 + }, + { + "epoch": 0.138625, + "grad_norm": 2.703125, + "grad_norm_var": 0.07737630208333333, + "learning_rate": 0.0001, + "loss": 8.0556, + "loss/crossentropy": 2.195094585418701, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.24484457075595856, + "step": 2218 + }, + { + "epoch": 0.13875, + "grad_norm": 2.703125, + "grad_norm_var": 0.07613525390625, + "learning_rate": 0.0001, + "loss": 8.0197, + "loss/crossentropy": 2.2862359285354614, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2567315921187401, + "step": 2220 + }, + { + "epoch": 0.138875, + "grad_norm": 2.546875, + "grad_norm_var": 0.07868550618489584, + "learning_rate": 0.0001, + "loss": 7.9319, + "loss/crossentropy": 2.331244111061096, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2514599338173866, + "step": 2222 + }, + { + "epoch": 0.139, + "grad_norm": 3.15625, + "grad_norm_var": 0.08726806640625, + "learning_rate": 0.0001, + "loss": 7.8826, + "loss/crossentropy": 1.9546465873718262, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23689769953489304, + "step": 2224 + }, + { + "epoch": 0.139125, + "grad_norm": 2.3125, + "grad_norm_var": 0.09897359212239583, + "learning_rate": 0.0001, + "loss": 7.859, + "loss/crossentropy": 2.0505433082580566, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24545447528362274, + "step": 2226 + }, + { + "epoch": 0.13925, + "grad_norm": 2.828125, + "grad_norm_var": 0.10256245930989584, + "learning_rate": 0.0001, + "loss": 7.9768, + "loss/crossentropy": 2.3643672466278076, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2489551082253456, + "step": 2228 + }, + { + "epoch": 0.139375, + "grad_norm": 2.96875, + "grad_norm_var": 0.09179280598958334, + "learning_rate": 0.0001, + "loss": 7.9637, + "loss/crossentropy": 2.4726024866104126, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2436269074678421, + "step": 2230 + }, + { + "epoch": 0.1395, + "grad_norm": 2.765625, + "grad_norm_var": 0.04537760416666667, + "learning_rate": 0.0001, + "loss": 8.1119, + "loss/crossentropy": 2.409575581550598, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.258441299200058, + "step": 2232 + }, + { + "epoch": 0.139625, + "grad_norm": 2.546875, + "grad_norm_var": 0.04572652180989583, + "learning_rate": 0.0001, + "loss": 7.8413, + "loss/crossentropy": 2.37674617767334, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26087239384651184, + "step": 2234 + }, + { + "epoch": 0.13975, + "grad_norm": 2.796875, + "grad_norm_var": 0.04651590983072917, + "learning_rate": 0.0001, + "loss": 7.9557, + "loss/crossentropy": 2.270553708076477, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24740490317344666, + "step": 2236 + }, + { + "epoch": 0.139875, + "grad_norm": 2.703125, + "grad_norm_var": 0.04517822265625, + "learning_rate": 0.0001, + "loss": 7.8289, + "loss/crossentropy": 2.32234787940979, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2481185868382454, + "step": 2238 + }, + { + "epoch": 0.14, + "grad_norm": 2.71875, + "grad_norm_var": 0.03321024576822917, + "learning_rate": 0.0001, + "loss": 8.0077, + "loss/crossentropy": 2.4945857524871826, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2944895774126053, + "step": 2240 + }, + { + "epoch": 0.140125, + "grad_norm": 2.609375, + "grad_norm_var": 0.0188385009765625, + "learning_rate": 0.0001, + "loss": 7.8994, + "loss/crossentropy": 2.3556969165802, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2561969757080078, + "step": 2242 + }, + { + "epoch": 0.14025, + "grad_norm": 2.8125, + "grad_norm_var": 0.013765462239583333, + "learning_rate": 0.0001, + "loss": 7.9348, + "loss/crossentropy": 2.219905376434326, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.273133248090744, + "step": 2244 + }, + { + "epoch": 0.140375, + "grad_norm": 2.734375, + "grad_norm_var": 0.00855712890625, + "learning_rate": 0.0001, + "loss": 8.0431, + "loss/crossentropy": 2.1384077668190002, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.237950399518013, + "step": 2246 + }, + { + "epoch": 0.1405, + "grad_norm": 2.8125, + "grad_norm_var": 0.0127349853515625, + "learning_rate": 0.0001, + "loss": 7.83, + "loss/crossentropy": 2.3398306369781494, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25574472546577454, + "step": 2248 + }, + { + "epoch": 0.140625, + "grad_norm": 2.484375, + "grad_norm_var": 0.015608723958333333, + "learning_rate": 0.0001, + "loss": 8.033, + "loss/crossentropy": 2.24453866481781, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.23751161247491837, + "step": 2250 + }, + { + "epoch": 0.14075, + "grad_norm": 2.65625, + "grad_norm_var": 0.014388020833333333, + "learning_rate": 0.0001, + "loss": 7.8561, + "loss/crossentropy": 1.9904406070709229, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24353782087564468, + "step": 2252 + }, + { + "epoch": 0.140875, + "grad_norm": 2.578125, + "grad_norm_var": 0.017252604166666668, + "learning_rate": 0.0001, + "loss": 7.866, + "loss/crossentropy": 2.367901563644409, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24962928891181946, + "step": 2254 + }, + { + "epoch": 0.141, + "grad_norm": 2.984375, + "grad_norm_var": 0.0227447509765625, + "learning_rate": 0.0001, + "loss": 8.0945, + "loss/crossentropy": 2.3909614086151123, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2490956410765648, + "step": 2256 + }, + { + "epoch": 0.141125, + "grad_norm": 2.75, + "grad_norm_var": 0.0219635009765625, + "learning_rate": 0.0001, + "loss": 8.0642, + "loss/crossentropy": 2.158316493034363, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2531931698322296, + "step": 2258 + }, + { + "epoch": 0.14125, + "grad_norm": 2.671875, + "grad_norm_var": 0.0224761962890625, + "learning_rate": 0.0001, + "loss": 7.8171, + "loss/crossentropy": 2.163187623023987, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24659747630357742, + "step": 2260 + }, + { + "epoch": 0.141375, + "grad_norm": 2.765625, + "grad_norm_var": 0.02261962890625, + "learning_rate": 0.0001, + "loss": 7.8513, + "loss/crossentropy": 2.2378615140914917, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.232786126434803, + "step": 2262 + }, + { + "epoch": 0.1415, + "grad_norm": 2.859375, + "grad_norm_var": 0.0192047119140625, + "learning_rate": 0.0001, + "loss": 7.9754, + "loss/crossentropy": 2.50004506111145, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24566112458705902, + "step": 2264 + }, + { + "epoch": 0.141625, + "grad_norm": 2.734375, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 7.9826, + "loss/crossentropy": 2.192861318588257, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26260973513126373, + "step": 2266 + }, + { + "epoch": 0.14175, + "grad_norm": 2.6875, + "grad_norm_var": 0.0215240478515625, + "learning_rate": 0.0001, + "loss": 7.8327, + "loss/crossentropy": 2.3900744915008545, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23667296767234802, + "step": 2268 + }, + { + "epoch": 0.141875, + "grad_norm": 2.671875, + "grad_norm_var": 0.0174713134765625, + "learning_rate": 0.0001, + "loss": 7.9866, + "loss/crossentropy": 2.459189772605896, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2659824937582016, + "step": 2270 + }, + { + "epoch": 0.142, + "grad_norm": 2.765625, + "grad_norm_var": 0.0124176025390625, + "learning_rate": 0.0001, + "loss": 7.9221, + "loss/crossentropy": 2.3119730949401855, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2727499008178711, + "step": 2272 + }, + { + "epoch": 0.142125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0190093994140625, + "learning_rate": 0.0001, + "loss": 8.0506, + "loss/crossentropy": 2.065304398536682, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2462991625070572, + "step": 2274 + }, + { + "epoch": 0.14225, + "grad_norm": 2.484375, + "grad_norm_var": 0.022021484375, + "learning_rate": 0.0001, + "loss": 7.8896, + "loss/crossentropy": 2.293634057044983, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23517119884490967, + "step": 2276 + }, + { + "epoch": 0.142375, + "grad_norm": 2.640625, + "grad_norm_var": 0.0225250244140625, + "learning_rate": 0.0001, + "loss": 7.9233, + "loss/crossentropy": 2.193318486213684, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24228639900684357, + "step": 2278 + }, + { + "epoch": 0.1425, + "grad_norm": 2.90625, + "grad_norm_var": 0.0252593994140625, + "learning_rate": 0.0001, + "loss": 7.9282, + "loss/crossentropy": 2.3415223360061646, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.25495699793100357, + "step": 2280 + }, + { + "epoch": 0.142625, + "grad_norm": 2.546875, + "grad_norm_var": 0.027318318684895832, + "learning_rate": 0.0001, + "loss": 7.8591, + "loss/crossentropy": 1.9665740132331848, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2337390035390854, + "step": 2282 + }, + { + "epoch": 0.14275, + "grad_norm": 2.828125, + "grad_norm_var": 0.024193318684895833, + "learning_rate": 0.0001, + "loss": 7.7983, + "loss/crossentropy": 2.0722063779830933, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23800316452980042, + "step": 2284 + }, + { + "epoch": 0.142875, + "grad_norm": 2.59375, + "grad_norm_var": 0.025093587239583333, + "learning_rate": 0.0001, + "loss": 7.9698, + "loss/crossentropy": 2.3465652465820312, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2537553757429123, + "step": 2286 + }, + { + "epoch": 0.143, + "grad_norm": 3.375, + "grad_norm_var": 0.057938639322916666, + "learning_rate": 0.0001, + "loss": 7.8197, + "loss/crossentropy": 2.4646483659744263, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2531234845519066, + "step": 2288 + }, + { + "epoch": 0.143125, + "grad_norm": 2.359375, + "grad_norm_var": 0.06323140462239583, + "learning_rate": 0.0001, + "loss": 7.8108, + "loss/crossentropy": 2.255491614341736, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24858752638101578, + "step": 2290 + }, + { + "epoch": 0.14325, + "grad_norm": 2.953125, + "grad_norm_var": 0.06523030598958333, + "learning_rate": 0.0001, + "loss": 7.9138, + "loss/crossentropy": 2.0624001026153564, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23998911678791046, + "step": 2292 + }, + { + "epoch": 0.143375, + "grad_norm": 2.609375, + "grad_norm_var": 0.06442769368489583, + "learning_rate": 0.0001, + "loss": 7.9531, + "loss/crossentropy": 2.4339091777801514, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2569551467895508, + "step": 2294 + }, + { + "epoch": 0.1435, + "grad_norm": 2.59375, + "grad_norm_var": 0.06369527180989583, + "learning_rate": 0.0001, + "loss": 7.9041, + "loss/crossentropy": 2.0448151230812073, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2471894770860672, + "step": 2296 + }, + { + "epoch": 0.143625, + "grad_norm": 2.578125, + "grad_norm_var": 0.0640625, + "learning_rate": 0.0001, + "loss": 8.0666, + "loss/crossentropy": 2.415607452392578, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2683194726705551, + "step": 2298 + }, + { + "epoch": 0.14375, + "grad_norm": 2.8125, + "grad_norm_var": 0.061930338541666664, + "learning_rate": 0.0001, + "loss": 7.9572, + "loss/crossentropy": 2.294751286506653, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2594631314277649, + "step": 2300 + }, + { + "epoch": 0.143875, + "grad_norm": 2.8125, + "grad_norm_var": 0.06265869140625, + "learning_rate": 0.0001, + "loss": 7.7507, + "loss/crossentropy": 2.219098746776581, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2452593445777893, + "step": 2302 + }, + { + "epoch": 0.144, + "grad_norm": 2.6875, + "grad_norm_var": 0.024833170572916667, + "learning_rate": 0.0001, + "loss": 7.7452, + "loss/crossentropy": 2.273571014404297, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23514091968536377, + "step": 2304 + }, + { + "epoch": 0.144125, + "grad_norm": 2.8125, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 8.092, + "loss/crossentropy": 2.470995545387268, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2565620690584183, + "step": 2306 + }, + { + "epoch": 0.14425, + "grad_norm": 2.671875, + "grad_norm_var": 0.022508748372395835, + "learning_rate": 0.0001, + "loss": 8.0169, + "loss/crossentropy": 2.4442650079727173, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2491839900612831, + "step": 2308 + }, + { + "epoch": 0.144375, + "grad_norm": 2.515625, + "grad_norm_var": 0.024120076497395834, + "learning_rate": 0.0001, + "loss": 7.8243, + "loss/crossentropy": 2.3042315244674683, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24753264337778091, + "step": 2310 + }, + { + "epoch": 0.1445, + "grad_norm": 2.609375, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 7.8473, + "loss/crossentropy": 2.247647523880005, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.242111474275589, + "step": 2312 + }, + { + "epoch": 0.144625, + "grad_norm": 2.765625, + "grad_norm_var": 0.021142578125, + "learning_rate": 0.0001, + "loss": 7.8245, + "loss/crossentropy": 2.0948009490966797, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22648434340953827, + "step": 2314 + }, + { + "epoch": 0.14475, + "grad_norm": 2.578125, + "grad_norm_var": 0.024540201822916666, + "learning_rate": 0.0001, + "loss": 7.9081, + "loss/crossentropy": 2.3449655771255493, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.25433051586151123, + "step": 2316 + }, + { + "epoch": 0.144875, + "grad_norm": 2.40625, + "grad_norm_var": 0.03191731770833333, + "learning_rate": 0.0001, + "loss": 7.8692, + "loss/crossentropy": 2.1519018411636353, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23911522328853607, + "step": 2318 + }, + { + "epoch": 0.145, + "grad_norm": 3.046875, + "grad_norm_var": 0.03612874348958333, + "learning_rate": 0.0001, + "loss": 8.0635, + "loss/crossentropy": 2.0561267137527466, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23246531933546066, + "step": 2320 + }, + { + "epoch": 0.145125, + "grad_norm": 2.578125, + "grad_norm_var": 0.031248982747395834, + "learning_rate": 0.0001, + "loss": 8.0962, + "loss/crossentropy": 2.413579821586609, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2741449773311615, + "step": 2322 + }, + { + "epoch": 0.14525, + "grad_norm": 3.0, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 7.8067, + "loss/crossentropy": 2.215983271598816, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24622830748558044, + "step": 2324 + }, + { + "epoch": 0.145375, + "grad_norm": 2.484375, + "grad_norm_var": 0.04117431640625, + "learning_rate": 0.0001, + "loss": 7.9424, + "loss/crossentropy": 2.2071104049682617, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24660057574510574, + "step": 2326 + }, + { + "epoch": 0.1455, + "grad_norm": 2.890625, + "grad_norm_var": 0.0425689697265625, + "learning_rate": 0.0001, + "loss": 8.0007, + "loss/crossentropy": 2.1792843341827393, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2711133062839508, + "step": 2328 + }, + { + "epoch": 0.145625, + "grad_norm": 2.609375, + "grad_norm_var": 0.0416015625, + "learning_rate": 0.0001, + "loss": 7.9595, + "loss/crossentropy": 2.256834030151367, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.26283788681030273, + "step": 2330 + }, + { + "epoch": 0.14575, + "grad_norm": 2.390625, + "grad_norm_var": 0.04439697265625, + "learning_rate": 0.0001, + "loss": 7.7117, + "loss/crossentropy": 1.91128808259964, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23225348442792892, + "step": 2332 + }, + { + "epoch": 0.145875, + "grad_norm": 2.78125, + "grad_norm_var": 0.03492431640625, + "learning_rate": 0.0001, + "loss": 7.9979, + "loss/crossentropy": 2.1611289978027344, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2694767862558365, + "step": 2334 + }, + { + "epoch": 0.146, + "grad_norm": 2.796875, + "grad_norm_var": 0.029423014322916666, + "learning_rate": 0.0001, + "loss": 7.9771, + "loss/crossentropy": 2.3651944398880005, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24654380977153778, + "step": 2336 + }, + { + "epoch": 0.146125, + "grad_norm": 2.78125, + "grad_norm_var": 0.03247782389322917, + "learning_rate": 0.0001, + "loss": 8.0897, + "loss/crossentropy": 2.365579605102539, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.26178716123104095, + "step": 2338 + }, + { + "epoch": 0.14625, + "grad_norm": 2.875, + "grad_norm_var": 0.025487263997395832, + "learning_rate": 0.0001, + "loss": 7.8216, + "loss/crossentropy": 2.195146918296814, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2360374853014946, + "step": 2340 + }, + { + "epoch": 0.146375, + "grad_norm": 2.484375, + "grad_norm_var": 0.02750244140625, + "learning_rate": 0.0001, + "loss": 8.0719, + "loss/crossentropy": 2.6680378913879395, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2590227723121643, + "step": 2342 + }, + { + "epoch": 0.1465, + "grad_norm": 2.765625, + "grad_norm_var": 0.025227864583333332, + "learning_rate": 0.0001, + "loss": 7.9196, + "loss/crossentropy": 2.307919979095459, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2748124748468399, + "step": 2344 + }, + { + "epoch": 0.146625, + "grad_norm": 2.609375, + "grad_norm_var": 0.025581868489583333, + "learning_rate": 0.0001, + "loss": 8.0198, + "loss/crossentropy": 2.478832721710205, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25123097002506256, + "step": 2346 + }, + { + "epoch": 0.14675, + "grad_norm": 3.109375, + "grad_norm_var": 0.025715128580729166, + "learning_rate": 0.0001, + "loss": 7.998, + "loss/crossentropy": 2.1463791131973267, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2501315772533417, + "step": 2348 + }, + { + "epoch": 0.146875, + "grad_norm": 2.375, + "grad_norm_var": 0.0401519775390625, + "learning_rate": 0.0001, + "loss": 7.9402, + "loss/crossentropy": 2.287923812866211, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2529391869902611, + "step": 2350 + }, + { + "epoch": 0.147, + "grad_norm": 2.84375, + "grad_norm_var": 0.04274800618489583, + "learning_rate": 0.0001, + "loss": 7.8521, + "loss/crossentropy": 2.393770456314087, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23246632516384125, + "step": 2352 + }, + { + "epoch": 0.147125, + "grad_norm": 2.828125, + "grad_norm_var": 0.03935445149739583, + "learning_rate": 0.0001, + "loss": 7.9868, + "loss/crossentropy": 1.9886181354522705, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2457761988043785, + "step": 2354 + }, + { + "epoch": 0.14725, + "grad_norm": 2.734375, + "grad_norm_var": 0.0428375244140625, + "learning_rate": 0.0001, + "loss": 7.9631, + "loss/crossentropy": 2.1264249682426453, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.261639803647995, + "step": 2356 + }, + { + "epoch": 0.147375, + "grad_norm": 2.875, + "grad_norm_var": 0.0380767822265625, + "learning_rate": 0.0001, + "loss": 7.8278, + "loss/crossentropy": 2.0697389245033264, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.22177913784980774, + "step": 2358 + }, + { + "epoch": 0.1475, + "grad_norm": 2.859375, + "grad_norm_var": 0.0396484375, + "learning_rate": 0.0001, + "loss": 7.9523, + "loss/crossentropy": 2.355503797531128, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.257536381483078, + "step": 2360 + }, + { + "epoch": 0.147625, + "grad_norm": 2.484375, + "grad_norm_var": 0.0458984375, + "learning_rate": 0.0001, + "loss": 7.7195, + "loss/crossentropy": 2.263971447944641, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2459626868367195, + "step": 2362 + }, + { + "epoch": 0.14775, + "grad_norm": 2.578125, + "grad_norm_var": 0.034886678059895836, + "learning_rate": 0.0001, + "loss": 7.8342, + "loss/crossentropy": 2.3069592714309692, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.25156907737255096, + "step": 2364 + }, + { + "epoch": 0.147875, + "grad_norm": 2.890625, + "grad_norm_var": 0.025902303059895833, + "learning_rate": 0.0001, + "loss": 7.8323, + "loss/crossentropy": 2.343958616256714, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.23790227621793747, + "step": 2366 + }, + { + "epoch": 0.148, + "grad_norm": 2.6875, + "grad_norm_var": 0.02525634765625, + "learning_rate": 0.0001, + "loss": 8.0088, + "loss/crossentropy": 2.1950390338897705, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.25015532970428467, + "step": 2368 + }, + { + "epoch": 0.148125, + "grad_norm": 2.40625, + "grad_norm_var": 0.0327789306640625, + "learning_rate": 0.0001, + "loss": 7.7179, + "loss/crossentropy": 2.18330717086792, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23950626701116562, + "step": 2370 + }, + { + "epoch": 0.14825, + "grad_norm": 3.109375, + "grad_norm_var": 0.04156494140625, + "learning_rate": 0.0001, + "loss": 7.89, + "loss/crossentropy": 2.359447479248047, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25177963823080063, + "step": 2372 + }, + { + "epoch": 0.148375, + "grad_norm": 2.921875, + "grad_norm_var": 0.048075358072916664, + "learning_rate": 0.0001, + "loss": 7.8616, + "loss/crossentropy": 2.1051629185676575, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.23466359078884125, + "step": 2374 + }, + { + "epoch": 0.1485, + "grad_norm": 2.921875, + "grad_norm_var": 0.05561421712239583, + "learning_rate": 0.0001, + "loss": 7.8136, + "loss/crossentropy": 2.4187822341918945, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24878371506929398, + "step": 2376 + }, + { + "epoch": 0.148625, + "grad_norm": 2.703125, + "grad_norm_var": 0.05022786458333333, + "learning_rate": 0.0001, + "loss": 7.8544, + "loss/crossentropy": 2.3044979572296143, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23919613659381866, + "step": 2378 + }, + { + "epoch": 0.14875, + "grad_norm": 2.515625, + "grad_norm_var": 0.05283203125, + "learning_rate": 0.0001, + "loss": 7.9553, + "loss/crossentropy": 2.334774613380432, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.26303067803382874, + "step": 2380 + }, + { + "epoch": 0.148875, + "grad_norm": 2.8125, + "grad_norm_var": 0.051268513997395834, + "learning_rate": 0.0001, + "loss": 7.8011, + "loss/crossentropy": 2.1512030363082886, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2616318315267563, + "step": 2382 + }, + { + "epoch": 0.149, + "grad_norm": 2.546875, + "grad_norm_var": 0.04903971354166667, + "learning_rate": 0.0001, + "loss": 7.9319, + "loss/crossentropy": 2.1831018924713135, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24666057527065277, + "step": 2384 + }, + { + "epoch": 0.149125, + "grad_norm": 3.125, + "grad_norm_var": 0.05806884765625, + "learning_rate": 0.0001, + "loss": 8.0404, + "loss/crossentropy": 2.278907299041748, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24930745363235474, + "step": 2386 + }, + { + "epoch": 0.14925, + "grad_norm": 2.46875, + "grad_norm_var": 0.04917704264322917, + "learning_rate": 0.0001, + "loss": 7.8234, + "loss/crossentropy": 2.117949962615967, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2466331273317337, + "step": 2388 + }, + { + "epoch": 0.149375, + "grad_norm": 3.21875, + "grad_norm_var": 0.05543212890625, + "learning_rate": 0.0001, + "loss": 7.9321, + "loss/crossentropy": 2.4539263248443604, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2619713842868805, + "step": 2390 + }, + { + "epoch": 0.1495, + "grad_norm": 2.703125, + "grad_norm_var": 0.0503082275390625, + "learning_rate": 0.0001, + "loss": 8.0366, + "loss/crossentropy": 2.155607759952545, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2557392567396164, + "step": 2392 + }, + { + "epoch": 0.149625, + "grad_norm": 2.828125, + "grad_norm_var": 0.0584625244140625, + "learning_rate": 0.0001, + "loss": 7.8701, + "loss/crossentropy": 2.226928472518921, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.227652445435524, + "step": 2394 + }, + { + "epoch": 0.14975, + "grad_norm": 2.53125, + "grad_norm_var": 0.05657552083333333, + "learning_rate": 0.0001, + "loss": 7.9392, + "loss/crossentropy": 2.152814030647278, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.22992898523807526, + "step": 2396 + }, + { + "epoch": 0.149875, + "grad_norm": 2.546875, + "grad_norm_var": 0.061258951822916664, + "learning_rate": 0.0001, + "loss": 7.877, + "loss/crossentropy": 2.0306188464164734, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2434871345758438, + "step": 2398 + }, + { + "epoch": 0.15, + "grad_norm": 2.90625, + "grad_norm_var": 0.057112630208333334, + "learning_rate": 0.0001, + "loss": 7.9455, + "loss/crossentropy": 2.3572858572006226, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24068891257047653, + "step": 2400 + }, + { + "epoch": 0.150125, + "grad_norm": 2.3125, + "grad_norm_var": 0.057112630208333334, + "learning_rate": 0.0001, + "loss": 7.8958, + "loss/crossentropy": 2.4808624982833862, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2544522359967232, + "step": 2402 + }, + { + "epoch": 0.15025, + "grad_norm": 2.765625, + "grad_norm_var": 0.055052693684895834, + "learning_rate": 0.0001, + "loss": 7.8099, + "loss/crossentropy": 2.0727401971817017, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25555629283189774, + "step": 2404 + }, + { + "epoch": 0.150375, + "grad_norm": 2.921875, + "grad_norm_var": 0.04289957682291667, + "learning_rate": 0.0001, + "loss": 7.9492, + "loss/crossentropy": 2.3265267610549927, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2484055981040001, + "step": 2406 + }, + { + "epoch": 0.1505, + "grad_norm": 2.96875, + "grad_norm_var": 0.05078837076822917, + "learning_rate": 0.0001, + "loss": 7.9314, + "loss/crossentropy": 2.455584764480591, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2524856925010681, + "step": 2408 + }, + { + "epoch": 0.150625, + "grad_norm": 2.59375, + "grad_norm_var": 0.044694010416666666, + "learning_rate": 0.0001, + "loss": 7.8646, + "loss/crossentropy": 2.425659656524658, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2739051878452301, + "step": 2410 + }, + { + "epoch": 0.15075, + "grad_norm": 2.5625, + "grad_norm_var": 0.042967732747395834, + "learning_rate": 0.0001, + "loss": 7.9698, + "loss/crossentropy": 2.306099534034729, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24583810567855835, + "step": 2412 + }, + { + "epoch": 0.150875, + "grad_norm": 2.734375, + "grad_norm_var": 0.04340718587239583, + "learning_rate": 0.0001, + "loss": 7.7465, + "loss/crossentropy": 2.4911707639694214, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24410546571016312, + "step": 2414 + }, + { + "epoch": 0.151, + "grad_norm": 2.75, + "grad_norm_var": 0.04588114420572917, + "learning_rate": 0.0001, + "loss": 7.8045, + "loss/crossentropy": 2.461613178253174, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2660996913909912, + "step": 2416 + }, + { + "epoch": 0.151125, + "grad_norm": 2.328125, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 0.0001, + "loss": 7.7771, + "loss/crossentropy": 2.1200402975082397, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23757921904325485, + "step": 2418 + }, + { + "epoch": 0.15125, + "grad_norm": 2.625, + "grad_norm_var": 0.048029581705729164, + "learning_rate": 0.0001, + "loss": 7.8634, + "loss/crossentropy": 2.0594701766967773, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2412218227982521, + "step": 2420 + }, + { + "epoch": 0.151375, + "grad_norm": 3.390625, + "grad_norm_var": 0.09226786295572917, + "learning_rate": 0.0001, + "loss": 7.9832, + "loss/crossentropy": 2.171906590461731, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24810399115085602, + "step": 2422 + }, + { + "epoch": 0.1515, + "grad_norm": 2.328125, + "grad_norm_var": 0.09123942057291666, + "learning_rate": 0.0001, + "loss": 7.8273, + "loss/crossentropy": 2.5124112367630005, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2484402135014534, + "step": 2424 + }, + { + "epoch": 0.151625, + "grad_norm": 2.703125, + "grad_norm_var": 0.0913726806640625, + "learning_rate": 0.0001, + "loss": 7.782, + "loss/crossentropy": 2.3001959323883057, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24032824486494064, + "step": 2426 + }, + { + "epoch": 0.15175, + "grad_norm": 2.546875, + "grad_norm_var": 0.0932525634765625, + "learning_rate": 0.0001, + "loss": 7.7919, + "loss/crossentropy": 2.170712888240814, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.26182495057582855, + "step": 2428 + }, + { + "epoch": 0.151875, + "grad_norm": 2.84375, + "grad_norm_var": 0.093359375, + "learning_rate": 0.0001, + "loss": 7.9205, + "loss/crossentropy": 2.2728850841522217, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2512703761458397, + "step": 2430 + }, + { + "epoch": 0.152, + "grad_norm": 2.46875, + "grad_norm_var": 0.0877838134765625, + "learning_rate": 0.0001, + "loss": 7.7564, + "loss/crossentropy": 2.2336456775665283, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24472886323928833, + "step": 2432 + }, + { + "epoch": 0.152125, + "grad_norm": 2.75, + "grad_norm_var": 0.07998758951822917, + "learning_rate": 0.0001, + "loss": 7.792, + "loss/crossentropy": 2.0555617809295654, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.21662656217813492, + "step": 2434 + }, + { + "epoch": 0.15225, + "grad_norm": 2.6875, + "grad_norm_var": 0.07681376139322917, + "learning_rate": 0.0001, + "loss": 7.852, + "loss/crossentropy": 2.1744298934936523, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.25150124728679657, + "step": 2436 + }, + { + "epoch": 0.152375, + "grad_norm": 2.78125, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 7.7934, + "loss/crossentropy": 2.0990302562713623, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22628428786993027, + "step": 2438 + }, + { + "epoch": 0.1525, + "grad_norm": 2.671875, + "grad_norm_var": 0.011844889322916666, + "learning_rate": 0.0001, + "loss": 7.8081, + "loss/crossentropy": 2.2353204488754272, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26493804156780243, + "step": 2440 + }, + { + "epoch": 0.152625, + "grad_norm": 2.953125, + "grad_norm_var": 0.017704264322916666, + "learning_rate": 0.0001, + "loss": 8.0923, + "loss/crossentropy": 2.281570076942444, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2562706768512726, + "step": 2442 + }, + { + "epoch": 0.15275, + "grad_norm": 2.34375, + "grad_norm_var": 0.0250396728515625, + "learning_rate": 0.0001, + "loss": 7.773, + "loss/crossentropy": 2.251350522041321, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2373836562037468, + "step": 2444 + }, + { + "epoch": 0.152875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0528472900390625, + "learning_rate": 0.0001, + "loss": 7.976, + "loss/crossentropy": 2.2960145473480225, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.25969095528125763, + "step": 2446 + }, + { + "epoch": 0.153, + "grad_norm": 2.28125, + "grad_norm_var": 0.05984700520833333, + "learning_rate": 0.0001, + "loss": 7.8294, + "loss/crossentropy": 2.28712797164917, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2389351725578308, + "step": 2448 + }, + { + "epoch": 0.153125, + "grad_norm": 2.890625, + "grad_norm_var": 0.06357421875, + "learning_rate": 0.0001, + "loss": 7.9498, + "loss/crossentropy": 2.170054316520691, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24584101140499115, + "step": 2450 + }, + { + "epoch": 0.15325, + "grad_norm": 2.78125, + "grad_norm_var": 0.07652587890625, + "learning_rate": 0.0001, + "loss": 8.0025, + "loss/crossentropy": 2.2106114625930786, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24998192489147186, + "step": 2452 + }, + { + "epoch": 0.153375, + "grad_norm": 2.4375, + "grad_norm_var": 0.07939046223958333, + "learning_rate": 0.0001, + "loss": 7.7971, + "loss/crossentropy": 2.2459070682525635, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24219267070293427, + "step": 2454 + }, + { + "epoch": 0.1535, + "grad_norm": 2.59375, + "grad_norm_var": 0.08028971354166667, + "learning_rate": 0.0001, + "loss": 7.8238, + "loss/crossentropy": 2.3142576217651367, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25120753794908524, + "step": 2456 + }, + { + "epoch": 0.153625, + "grad_norm": 2.640625, + "grad_norm_var": 0.0760406494140625, + "learning_rate": 0.0001, + "loss": 7.9069, + "loss/crossentropy": 2.390681028366089, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2660007178783417, + "step": 2458 + }, + { + "epoch": 0.15375, + "grad_norm": 2.6875, + "grad_norm_var": 0.07095947265625, + "learning_rate": 0.0001, + "loss": 7.9599, + "loss/crossentropy": 2.3622519969940186, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2376810610294342, + "step": 2460 + }, + { + "epoch": 0.153875, + "grad_norm": 2.765625, + "grad_norm_var": 0.05569254557291667, + "learning_rate": 0.0001, + "loss": 7.7633, + "loss/crossentropy": 2.587849259376526, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2587762326002121, + "step": 2462 + }, + { + "epoch": 0.154, + "grad_norm": 2.5625, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 8.0367, + "loss/crossentropy": 2.0119821429252625, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.23865149170160294, + "step": 2464 + }, + { + "epoch": 0.154125, + "grad_norm": 2.578125, + "grad_norm_var": 0.04401041666666667, + "learning_rate": 0.0001, + "loss": 7.9178, + "loss/crossentropy": 2.278993248939514, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2538345381617546, + "step": 2466 + }, + { + "epoch": 0.15425, + "grad_norm": 2.578125, + "grad_norm_var": 0.029352823893229168, + "learning_rate": 0.0001, + "loss": 7.8351, + "loss/crossentropy": 2.2865071296691895, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24299630522727966, + "step": 2468 + }, + { + "epoch": 0.154375, + "grad_norm": 2.796875, + "grad_norm_var": 0.027497355143229166, + "learning_rate": 0.0001, + "loss": 7.7031, + "loss/crossentropy": 2.118988275527954, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23056157678365707, + "step": 2470 + }, + { + "epoch": 0.1545, + "grad_norm": 2.25, + "grad_norm_var": 0.0398101806640625, + "learning_rate": 0.0001, + "loss": 7.7647, + "loss/crossentropy": 2.122451901435852, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2540634050965309, + "step": 2472 + }, + { + "epoch": 0.154625, + "grad_norm": 2.875, + "grad_norm_var": 0.041845703125, + "learning_rate": 0.0001, + "loss": 7.8108, + "loss/crossentropy": 2.2460381984710693, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2485114336013794, + "step": 2474 + }, + { + "epoch": 0.15475, + "grad_norm": 2.828125, + "grad_norm_var": 0.0369049072265625, + "learning_rate": 0.0001, + "loss": 7.8238, + "loss/crossentropy": 2.2321892976760864, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24692068994045258, + "step": 2476 + }, + { + "epoch": 0.154875, + "grad_norm": 2.671875, + "grad_norm_var": 0.0273834228515625, + "learning_rate": 0.0001, + "loss": 7.8107, + "loss/crossentropy": 2.47454035282135, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24419991672039032, + "step": 2478 + }, + { + "epoch": 0.155, + "grad_norm": 2.703125, + "grad_norm_var": 0.026399739583333335, + "learning_rate": 0.0001, + "loss": 7.782, + "loss/crossentropy": 2.1783688068389893, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24744782596826553, + "step": 2480 + }, + { + "epoch": 0.155125, + "grad_norm": 2.671875, + "grad_norm_var": 0.029524739583333334, + "learning_rate": 0.0001, + "loss": 7.9437, + "loss/crossentropy": 2.553811550140381, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2530565932393074, + "step": 2482 + }, + { + "epoch": 0.15525, + "grad_norm": 2.53125, + "grad_norm_var": 0.04010009765625, + "learning_rate": 0.0001, + "loss": 7.8713, + "loss/crossentropy": 2.441239356994629, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2551605701446533, + "step": 2484 + }, + { + "epoch": 0.155375, + "grad_norm": 2.5625, + "grad_norm_var": 0.038309733072916664, + "learning_rate": 0.0001, + "loss": 7.7061, + "loss/crossentropy": 2.370589256286621, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25826428830623627, + "step": 2486 + }, + { + "epoch": 0.1555, + "grad_norm": 2.640625, + "grad_norm_var": 0.026390584309895833, + "learning_rate": 0.0001, + "loss": 7.7961, + "loss/crossentropy": 2.1814417839050293, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2495630532503128, + "step": 2488 + }, + { + "epoch": 0.155625, + "grad_norm": 2.890625, + "grad_norm_var": 0.029173787434895834, + "learning_rate": 0.0001, + "loss": 7.8197, + "loss/crossentropy": 2.4883482456207275, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.26348088681697845, + "step": 2490 + }, + { + "epoch": 0.15575, + "grad_norm": 2.578125, + "grad_norm_var": 0.027632649739583334, + "learning_rate": 0.0001, + "loss": 7.8219, + "loss/crossentropy": 2.1873401403427124, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24557159841060638, + "step": 2492 + }, + { + "epoch": 0.155875, + "grad_norm": 2.75, + "grad_norm_var": 0.02867431640625, + "learning_rate": 0.0001, + "loss": 7.8562, + "loss/crossentropy": 2.3110402822494507, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24427203088998795, + "step": 2494 + }, + { + "epoch": 0.156, + "grad_norm": 2.953125, + "grad_norm_var": 0.03600260416666667, + "learning_rate": 0.0001, + "loss": 7.8415, + "loss/crossentropy": 2.2570624351501465, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24605220556259155, + "step": 2496 + }, + { + "epoch": 0.156125, + "grad_norm": 2.3125, + "grad_norm_var": 0.04087626139322917, + "learning_rate": 0.0001, + "loss": 7.6643, + "loss/crossentropy": 2.2185534238815308, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2364451214671135, + "step": 2498 + }, + { + "epoch": 0.15625, + "grad_norm": 3.078125, + "grad_norm_var": 0.039948527018229166, + "learning_rate": 0.0001, + "loss": 7.8161, + "loss/crossentropy": 2.2274473905563354, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.24973751604557037, + "step": 2500 + }, + { + "epoch": 0.156375, + "grad_norm": 2.46875, + "grad_norm_var": 0.042292277018229164, + "learning_rate": 0.0001, + "loss": 8.0038, + "loss/crossentropy": 2.3013203144073486, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24737702310085297, + "step": 2502 + }, + { + "epoch": 0.1565, + "grad_norm": 2.484375, + "grad_norm_var": 0.0455078125, + "learning_rate": 0.0001, + "loss": 7.7928, + "loss/crossentropy": 2.5051584243774414, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25412553548812866, + "step": 2504 + }, + { + "epoch": 0.156625, + "grad_norm": 2.890625, + "grad_norm_var": 0.0430084228515625, + "learning_rate": 0.0001, + "loss": 7.9481, + "loss/crossentropy": 2.189783751964569, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.24579951167106628, + "step": 2506 + }, + { + "epoch": 0.15675, + "grad_norm": 2.796875, + "grad_norm_var": 0.0482818603515625, + "learning_rate": 0.0001, + "loss": 7.8348, + "loss/crossentropy": 2.629545569419861, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.26841507852077484, + "step": 2508 + }, + { + "epoch": 0.156875, + "grad_norm": 2.34375, + "grad_norm_var": 0.060212198893229166, + "learning_rate": 0.0001, + "loss": 7.7123, + "loss/crossentropy": 2.267096519470215, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.24167770147323608, + "step": 2510 + }, + { + "epoch": 0.157, + "grad_norm": 2.546875, + "grad_norm_var": 0.05322977701822917, + "learning_rate": 0.0001, + "loss": 7.6795, + "loss/crossentropy": 2.170002818107605, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.2375175580382347, + "step": 2512 + }, + { + "epoch": 0.157125, + "grad_norm": 2.890625, + "grad_norm_var": 0.05366109212239583, + "learning_rate": 0.0001, + "loss": 7.8975, + "loss/crossentropy": 2.1839526891708374, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24612244218587875, + "step": 2514 + }, + { + "epoch": 0.15725, + "grad_norm": 2.484375, + "grad_norm_var": 0.0496734619140625, + "learning_rate": 0.0001, + "loss": 7.7503, + "loss/crossentropy": 1.7978224754333496, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.22572653740644455, + "step": 2516 + }, + { + "epoch": 0.157375, + "grad_norm": 2.9375, + "grad_norm_var": 0.052567545572916666, + "learning_rate": 0.0001, + "loss": 7.9403, + "loss/crossentropy": 2.2894846200942993, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24215564131736755, + "step": 2518 + }, + { + "epoch": 0.1575, + "grad_norm": 2.578125, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 0.0001, + "loss": 8.0173, + "loss/crossentropy": 2.271655559539795, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23966332525014877, + "step": 2520 + }, + { + "epoch": 0.157625, + "grad_norm": 2.640625, + "grad_norm_var": 0.051070149739583334, + "learning_rate": 0.0001, + "loss": 8.027, + "loss/crossentropy": 2.30005145072937, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2628418430685997, + "step": 2522 + }, + { + "epoch": 0.15775, + "grad_norm": 2.65625, + "grad_norm_var": 0.03728841145833333, + "learning_rate": 0.0001, + "loss": 7.5896, + "loss/crossentropy": 2.0799155235290527, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.23638420552015305, + "step": 2524 + }, + { + "epoch": 0.157875, + "grad_norm": 2.859375, + "grad_norm_var": 0.03769124348958333, + "learning_rate": 0.0001, + "loss": 7.9816, + "loss/crossentropy": 2.13996684551239, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24061943590641022, + "step": 2526 + }, + { + "epoch": 0.158, + "grad_norm": 2.546875, + "grad_norm_var": 0.03584696451822917, + "learning_rate": 0.0001, + "loss": 7.89, + "loss/crossentropy": 2.4007346630096436, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2567130923271179, + "step": 2528 + }, + { + "epoch": 0.158125, + "grad_norm": 2.859375, + "grad_norm_var": 0.0336578369140625, + "learning_rate": 0.0001, + "loss": 7.8945, + "loss/crossentropy": 2.3631176948547363, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2415386661887169, + "step": 2530 + }, + { + "epoch": 0.15825, + "grad_norm": 2.5625, + "grad_norm_var": 0.02857666015625, + "learning_rate": 0.0001, + "loss": 7.7907, + "loss/crossentropy": 2.058986485004425, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24685738235712051, + "step": 2532 + }, + { + "epoch": 0.158375, + "grad_norm": 2.5, + "grad_norm_var": 0.02593994140625, + "learning_rate": 0.0001, + "loss": 7.8062, + "loss/crossentropy": 2.2281254529953003, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24689025431871414, + "step": 2534 + }, + { + "epoch": 0.1585, + "grad_norm": 2.5625, + "grad_norm_var": 0.023127237955729168, + "learning_rate": 0.0001, + "loss": 7.8028, + "loss/crossentropy": 2.4382470846176147, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.27194739878177643, + "step": 2536 + }, + { + "epoch": 0.158625, + "grad_norm": 2.96875, + "grad_norm_var": 0.028083292643229167, + "learning_rate": 0.0001, + "loss": 7.8523, + "loss/crossentropy": 2.2935184240341187, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.23303698748350143, + "step": 2538 + }, + { + "epoch": 0.15875, + "grad_norm": 2.578125, + "grad_norm_var": 0.0289703369140625, + "learning_rate": 0.0001, + "loss": 7.8038, + "loss/crossentropy": 2.43733286857605, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23855997622013092, + "step": 2540 + }, + { + "epoch": 0.158875, + "grad_norm": 2.796875, + "grad_norm_var": 0.020894368489583332, + "learning_rate": 0.0001, + "loss": 7.8183, + "loss/crossentropy": 2.318352222442627, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2344244346022606, + "step": 2542 + }, + { + "epoch": 0.159, + "grad_norm": 2.546875, + "grad_norm_var": 0.021468098958333334, + "learning_rate": 0.0001, + "loss": 7.7506, + "loss/crossentropy": 2.172747015953064, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23044558614492416, + "step": 2544 + }, + { + "epoch": 0.159125, + "grad_norm": 2.625, + "grad_norm_var": 0.017024739583333334, + "learning_rate": 0.0001, + "loss": 7.7647, + "loss/crossentropy": 2.2639771699905396, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2624407559633255, + "step": 2546 + }, + { + "epoch": 0.15925, + "grad_norm": 2.59375, + "grad_norm_var": 0.05847981770833333, + "learning_rate": 0.0001, + "loss": 7.8837, + "loss/crossentropy": 2.484058380126953, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2976628988981247, + "step": 2548 + }, + { + "epoch": 0.159375, + "grad_norm": 2.671875, + "grad_norm_var": 0.05621337890625, + "learning_rate": 0.0001, + "loss": 7.9374, + "loss/crossentropy": 2.439212441444397, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.23525572568178177, + "step": 2550 + }, + { + "epoch": 0.1595, + "grad_norm": 2.6875, + "grad_norm_var": 0.061009724934895836, + "learning_rate": 0.0001, + "loss": 7.6554, + "loss/crossentropy": 2.046541452407837, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.23553012311458588, + "step": 2552 + }, + { + "epoch": 0.159625, + "grad_norm": 2.65625, + "grad_norm_var": 0.058430989583333336, + "learning_rate": 0.0001, + "loss": 7.679, + "loss/crossentropy": 2.027937591075897, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24199260026216507, + "step": 2554 + }, + { + "epoch": 0.15975, + "grad_norm": 2.4375, + "grad_norm_var": 0.06506245930989583, + "learning_rate": 0.0001, + "loss": 7.6605, + "loss/crossentropy": 1.922214150428772, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.21722379326820374, + "step": 2556 + }, + { + "epoch": 0.159875, + "grad_norm": 2.796875, + "grad_norm_var": 0.06368815104166667, + "learning_rate": 0.0001, + "loss": 7.947, + "loss/crossentropy": 2.41329824924469, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23701953887939453, + "step": 2558 + }, + { + "epoch": 0.16, + "grad_norm": 2.625, + "grad_norm_var": 0.06347249348958334, + "learning_rate": 0.0001, + "loss": 7.9478, + "loss/crossentropy": 2.364239811897278, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.26914364099502563, + "step": 2560 + }, + { + "epoch": 0.160125, + "grad_norm": 2.53125, + "grad_norm_var": 0.06280924479166666, + "learning_rate": 0.0001, + "loss": 7.8012, + "loss/crossentropy": 2.2895541191101074, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2240670844912529, + "step": 2562 + }, + { + "epoch": 0.16025, + "grad_norm": 2.59375, + "grad_norm_var": 0.022858683268229166, + "learning_rate": 0.0001, + "loss": 7.9514, + "loss/crossentropy": 2.21063768863678, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23704807460308075, + "step": 2564 + }, + { + "epoch": 0.160375, + "grad_norm": 2.421875, + "grad_norm_var": 0.026102701822916668, + "learning_rate": 0.0001, + "loss": 7.9352, + "loss/crossentropy": 2.1890910863876343, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.25638218969106674, + "step": 2566 + }, + { + "epoch": 0.1605, + "grad_norm": 2.734375, + "grad_norm_var": 0.023014322916666666, + "learning_rate": 0.0001, + "loss": 7.7254, + "loss/crossentropy": 2.3455424308776855, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22738848626613617, + "step": 2568 + }, + { + "epoch": 0.160625, + "grad_norm": 2.640625, + "grad_norm_var": 0.018830362955729166, + "learning_rate": 0.0001, + "loss": 7.9244, + "loss/crossentropy": 2.4760804176330566, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2477440983057022, + "step": 2570 + }, + { + "epoch": 0.16075, + "grad_norm": 2.78125, + "grad_norm_var": 0.0157379150390625, + "learning_rate": 0.0001, + "loss": 7.8971, + "loss/crossentropy": 2.2207542657852173, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23943090438842773, + "step": 2572 + }, + { + "epoch": 0.160875, + "grad_norm": 2.796875, + "grad_norm_var": 0.0168609619140625, + "learning_rate": 0.0001, + "loss": 7.8168, + "loss/crossentropy": 2.5181933641433716, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2479179948568344, + "step": 2574 + }, + { + "epoch": 0.161, + "grad_norm": 2.75, + "grad_norm_var": 0.015721638997395832, + "learning_rate": 0.0001, + "loss": 7.7453, + "loss/crossentropy": 2.37592613697052, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24624846875667572, + "step": 2576 + }, + { + "epoch": 0.161125, + "grad_norm": 3.15625, + "grad_norm_var": 0.030204264322916667, + "learning_rate": 0.0001, + "loss": 7.8473, + "loss/crossentropy": 2.2562596797943115, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2466205209493637, + "step": 2578 + }, + { + "epoch": 0.16125, + "grad_norm": 2.625, + "grad_norm_var": 0.029866536458333332, + "learning_rate": 0.0001, + "loss": 7.8813, + "loss/crossentropy": 2.23412823677063, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2418230101466179, + "step": 2580 + }, + { + "epoch": 0.161375, + "grad_norm": 2.625, + "grad_norm_var": 0.023030598958333332, + "learning_rate": 0.0001, + "loss": 7.9116, + "loss/crossentropy": 2.529879093170166, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2786422669887543, + "step": 2582 + }, + { + "epoch": 0.1615, + "grad_norm": 2.515625, + "grad_norm_var": 0.0252593994140625, + "learning_rate": 0.0001, + "loss": 7.7615, + "loss/crossentropy": 2.1202937364578247, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24669316411018372, + "step": 2584 + }, + { + "epoch": 0.161625, + "grad_norm": 2.59375, + "grad_norm_var": 0.03206278483072917, + "learning_rate": 0.0001, + "loss": 7.7169, + "loss/crossentropy": 2.2816332578659058, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23847512155771255, + "step": 2586 + }, + { + "epoch": 0.16175, + "grad_norm": 2.828125, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 7.7928, + "loss/crossentropy": 2.264583945274353, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.23760029673576355, + "step": 2588 + }, + { + "epoch": 0.161875, + "grad_norm": 2.4375, + "grad_norm_var": 0.03433837890625, + "learning_rate": 0.0001, + "loss": 7.8559, + "loss/crossentropy": 2.070025682449341, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2591887414455414, + "step": 2590 + }, + { + "epoch": 0.162, + "grad_norm": 2.453125, + "grad_norm_var": 0.039388020833333336, + "learning_rate": 0.0001, + "loss": 7.8661, + "loss/crossentropy": 2.3117960691452026, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.23791569471359253, + "step": 2592 + }, + { + "epoch": 0.162125, + "grad_norm": 2.5625, + "grad_norm_var": 0.0252593994140625, + "learning_rate": 0.0001, + "loss": 7.4415, + "loss/crossentropy": 2.2540050745010376, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22730688750743866, + "step": 2594 + }, + { + "epoch": 0.16225, + "grad_norm": 2.703125, + "grad_norm_var": 0.03404947916666667, + "learning_rate": 0.0001, + "loss": 7.6522, + "loss/crossentropy": 1.7815396785736084, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2086879387497902, + "step": 2596 + }, + { + "epoch": 0.162375, + "grad_norm": 2.625, + "grad_norm_var": 0.03359375, + "learning_rate": 0.0001, + "loss": 7.7545, + "loss/crossentropy": 2.38827908039093, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24543824791908264, + "step": 2598 + }, + { + "epoch": 0.1625, + "grad_norm": 2.84375, + "grad_norm_var": 0.03720703125, + "learning_rate": 0.0001, + "loss": 7.8079, + "loss/crossentropy": 2.4880915880203247, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2978169023990631, + "step": 2600 + }, + { + "epoch": 0.162625, + "grad_norm": 2.515625, + "grad_norm_var": 0.03308817545572917, + "learning_rate": 0.0001, + "loss": 7.8877, + "loss/crossentropy": 2.4218918085098267, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24140693247318268, + "step": 2602 + }, + { + "epoch": 0.16275, + "grad_norm": 2.53125, + "grad_norm_var": 0.031053670247395835, + "learning_rate": 0.0001, + "loss": 7.8144, + "loss/crossentropy": 2.266621947288513, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26293135434389114, + "step": 2604 + }, + { + "epoch": 0.162875, + "grad_norm": 2.578125, + "grad_norm_var": 0.028499348958333334, + "learning_rate": 0.0001, + "loss": 7.7038, + "loss/crossentropy": 2.1036359071731567, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2307727113366127, + "step": 2606 + }, + { + "epoch": 0.163, + "grad_norm": 2.6875, + "grad_norm_var": 0.04426981608072917, + "learning_rate": 0.0001, + "loss": 7.8715, + "loss/crossentropy": 2.2452776432037354, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26242855191230774, + "step": 2608 + }, + { + "epoch": 0.163125, + "grad_norm": 2.46875, + "grad_norm_var": 0.04185791015625, + "learning_rate": 0.0001, + "loss": 7.8903, + "loss/crossentropy": 2.2557637691497803, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.272522896528244, + "step": 2610 + }, + { + "epoch": 0.16325, + "grad_norm": 2.65625, + "grad_norm_var": 0.03326416015625, + "learning_rate": 0.0001, + "loss": 7.9233, + "loss/crossentropy": 2.375272512435913, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2328314259648323, + "step": 2612 + }, + { + "epoch": 0.163375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03937886555989583, + "learning_rate": 0.0001, + "loss": 7.8084, + "loss/crossentropy": 2.2634716033935547, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2391228973865509, + "step": 2614 + }, + { + "epoch": 0.1635, + "grad_norm": 2.75, + "grad_norm_var": 0.04064127604166667, + "learning_rate": 0.0001, + "loss": 7.888, + "loss/crossentropy": 2.5287814140319824, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2443351447582245, + "step": 2616 + }, + { + "epoch": 0.163625, + "grad_norm": 2.375, + "grad_norm_var": 0.04646708170572917, + "learning_rate": 0.0001, + "loss": 7.6805, + "loss/crossentropy": 2.142694592475891, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24597708880901337, + "step": 2618 + }, + { + "epoch": 0.16375, + "grad_norm": 3.25, + "grad_norm_var": 0.06467997233072917, + "learning_rate": 0.0001, + "loss": 7.8287, + "loss/crossentropy": 1.996739685535431, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25455768406391144, + "step": 2620 + }, + { + "epoch": 0.163875, + "grad_norm": 2.40625, + "grad_norm_var": 0.06575419108072916, + "learning_rate": 0.0001, + "loss": 7.8684, + "loss/crossentropy": 2.460938572883606, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.25081367045640945, + "step": 2622 + }, + { + "epoch": 0.164, + "grad_norm": 2.828125, + "grad_norm_var": 0.05423177083333333, + "learning_rate": 0.0001, + "loss": 7.8841, + "loss/crossentropy": 2.4802383184432983, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24186843633651733, + "step": 2624 + }, + { + "epoch": 0.164125, + "grad_norm": 2.609375, + "grad_norm_var": 0.05076395670572917, + "learning_rate": 0.0001, + "loss": 7.8454, + "loss/crossentropy": 2.209325075149536, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.237510085105896, + "step": 2626 + }, + { + "epoch": 0.16425, + "grad_norm": 2.28125, + "grad_norm_var": 0.06262613932291666, + "learning_rate": 0.0001, + "loss": 7.7406, + "loss/crossentropy": 2.0401015281677246, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.20650795102119446, + "step": 2628 + }, + { + "epoch": 0.164375, + "grad_norm": 2.890625, + "grad_norm_var": 0.06002604166666667, + "learning_rate": 0.0001, + "loss": 7.894, + "loss/crossentropy": 2.5026817321777344, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24748124927282333, + "step": 2630 + }, + { + "epoch": 0.1645, + "grad_norm": 2.34375, + "grad_norm_var": 0.0602691650390625, + "learning_rate": 0.0001, + "loss": 7.7024, + "loss/crossentropy": 2.2391568422317505, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.22500373423099518, + "step": 2632 + }, + { + "epoch": 0.164625, + "grad_norm": 2.578125, + "grad_norm_var": 0.05440165201822917, + "learning_rate": 0.0001, + "loss": 7.7876, + "loss/crossentropy": 2.2685747742652893, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2358316034078598, + "step": 2634 + }, + { + "epoch": 0.16475, + "grad_norm": 2.703125, + "grad_norm_var": 0.027220662434895834, + "learning_rate": 0.0001, + "loss": 7.7126, + "loss/crossentropy": 2.2785946130752563, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24026557803153992, + "step": 2636 + }, + { + "epoch": 0.164875, + "grad_norm": 2.671875, + "grad_norm_var": 0.024898274739583334, + "learning_rate": 0.0001, + "loss": 7.8153, + "loss/crossentropy": 2.2721141576766968, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2398277372121811, + "step": 2638 + }, + { + "epoch": 0.165, + "grad_norm": 2.5625, + "grad_norm_var": 0.023224894205729166, + "learning_rate": 0.0001, + "loss": 7.9432, + "loss/crossentropy": 2.229793667793274, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.21887121349573135, + "step": 2640 + }, + { + "epoch": 0.165125, + "grad_norm": 2.4375, + "grad_norm_var": 0.023876953125, + "learning_rate": 0.0001, + "loss": 7.603, + "loss/crossentropy": 2.1890532970428467, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23123417794704437, + "step": 2642 + }, + { + "epoch": 0.16525, + "grad_norm": 2.53125, + "grad_norm_var": 0.0223052978515625, + "learning_rate": 0.0001, + "loss": 7.6917, + "loss/crossentropy": 2.172744870185852, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22291851788759232, + "step": 2644 + }, + { + "epoch": 0.165375, + "grad_norm": 2.75, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 0.0001, + "loss": 7.6527, + "loss/crossentropy": 2.2864272594451904, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24596457928419113, + "step": 2646 + }, + { + "epoch": 0.1655, + "grad_norm": 2.53125, + "grad_norm_var": 0.01695556640625, + "learning_rate": 0.0001, + "loss": 7.858, + "loss/crossentropy": 2.428277611732483, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.25081363320350647, + "step": 2648 + }, + { + "epoch": 0.165625, + "grad_norm": 2.671875, + "grad_norm_var": 0.017508951822916667, + "learning_rate": 0.0001, + "loss": 7.8198, + "loss/crossentropy": 2.2622756958007812, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24526074528694153, + "step": 2650 + }, + { + "epoch": 0.16575, + "grad_norm": 2.453125, + "grad_norm_var": 0.0265289306640625, + "learning_rate": 0.0001, + "loss": 7.6316, + "loss/crossentropy": 2.1581307649612427, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24160084128379822, + "step": 2652 + }, + { + "epoch": 0.165875, + "grad_norm": 2.78125, + "grad_norm_var": 0.0283203125, + "learning_rate": 0.0001, + "loss": 7.8289, + "loss/crossentropy": 2.2180778980255127, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24087932705879211, + "step": 2654 + }, + { + "epoch": 0.166, + "grad_norm": 2.796875, + "grad_norm_var": 0.030939737955729168, + "learning_rate": 0.0001, + "loss": 7.8398, + "loss/crossentropy": 2.3046650886535645, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.25179338455200195, + "step": 2656 + }, + { + "epoch": 0.166125, + "grad_norm": 2.796875, + "grad_norm_var": 0.02769775390625, + "learning_rate": 0.0001, + "loss": 7.7888, + "loss/crossentropy": 2.251810073852539, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23383210599422455, + "step": 2658 + }, + { + "epoch": 0.16625, + "grad_norm": 2.515625, + "grad_norm_var": 0.0210357666015625, + "learning_rate": 0.0001, + "loss": 7.6108, + "loss/crossentropy": 2.326148748397827, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.267700731754303, + "step": 2660 + }, + { + "epoch": 0.166375, + "grad_norm": 2.640625, + "grad_norm_var": 0.021410115559895835, + "learning_rate": 0.0001, + "loss": 7.6615, + "loss/crossentropy": 2.3175487518310547, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2537999600172043, + "step": 2662 + }, + { + "epoch": 0.1665, + "grad_norm": 2.609375, + "grad_norm_var": 0.019310506184895833, + "learning_rate": 0.0001, + "loss": 7.7319, + "loss/crossentropy": 2.4394067525863647, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24439330399036407, + "step": 2664 + }, + { + "epoch": 0.166625, + "grad_norm": 2.5, + "grad_norm_var": 0.024934895833333335, + "learning_rate": 0.0001, + "loss": 7.7306, + "loss/crossentropy": 2.377542495727539, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2411457523703575, + "step": 2666 + }, + { + "epoch": 0.16675, + "grad_norm": 2.96875, + "grad_norm_var": 0.026097615559895832, + "learning_rate": 0.0001, + "loss": 7.7291, + "loss/crossentropy": 2.244265556335449, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2636634260416031, + "step": 2668 + }, + { + "epoch": 0.166875, + "grad_norm": 2.40625, + "grad_norm_var": 0.029618326822916666, + "learning_rate": 0.0001, + "loss": 7.817, + "loss/crossentropy": 2.3067715167999268, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2648337334394455, + "step": 2670 + }, + { + "epoch": 0.167, + "grad_norm": 2.78125, + "grad_norm_var": 0.02916259765625, + "learning_rate": 0.0001, + "loss": 7.9386, + "loss/crossentropy": 2.3284155130386353, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25726044178009033, + "step": 2672 + }, + { + "epoch": 0.167125, + "grad_norm": 2.609375, + "grad_norm_var": 0.03279520670572917, + "learning_rate": 0.0001, + "loss": 7.6504, + "loss/crossentropy": 2.1608939170837402, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23987659811973572, + "step": 2674 + }, + { + "epoch": 0.16725, + "grad_norm": 3.0625, + "grad_norm_var": 0.04436442057291667, + "learning_rate": 0.0001, + "loss": 7.7824, + "loss/crossentropy": 2.156682312488556, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22545601427555084, + "step": 2676 + }, + { + "epoch": 0.167375, + "grad_norm": 2.765625, + "grad_norm_var": 0.07141520182291666, + "learning_rate": 0.0001, + "loss": 7.8573, + "loss/crossentropy": 2.1365780234336853, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.21754685044288635, + "step": 2678 + }, + { + "epoch": 0.1675, + "grad_norm": 2.484375, + "grad_norm_var": 0.07617899576822916, + "learning_rate": 0.0001, + "loss": 7.9647, + "loss/crossentropy": 2.27071213722229, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22438976168632507, + "step": 2680 + }, + { + "epoch": 0.167625, + "grad_norm": 2.765625, + "grad_norm_var": 0.0685455322265625, + "learning_rate": 0.0001, + "loss": 7.7773, + "loss/crossentropy": 2.115522563457489, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.22477930784225464, + "step": 2682 + }, + { + "epoch": 0.16775, + "grad_norm": 2.46875, + "grad_norm_var": 0.06461181640625, + "learning_rate": 0.0001, + "loss": 7.7755, + "loss/crossentropy": 2.215229034423828, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.22990095615386963, + "step": 2684 + }, + { + "epoch": 0.167875, + "grad_norm": 2.484375, + "grad_norm_var": 0.06243082682291667, + "learning_rate": 0.0001, + "loss": 7.7615, + "loss/crossentropy": 2.180325746536255, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.232216514647007, + "step": 2686 + }, + { + "epoch": 0.168, + "grad_norm": 2.453125, + "grad_norm_var": 0.0634185791015625, + "learning_rate": 0.0001, + "loss": 7.713, + "loss/crossentropy": 2.4484771490097046, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2512357458472252, + "step": 2688 + }, + { + "epoch": 0.168125, + "grad_norm": 2.5625, + "grad_norm_var": 0.0561431884765625, + "learning_rate": 0.0001, + "loss": 7.7087, + "loss/crossentropy": 2.178835153579712, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24023275077342987, + "step": 2690 + }, + { + "epoch": 0.16825, + "grad_norm": 2.53125, + "grad_norm_var": 0.045750935872395836, + "learning_rate": 0.0001, + "loss": 7.6464, + "loss/crossentropy": 2.0620937943458557, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23334022611379623, + "step": 2692 + }, + { + "epoch": 0.168375, + "grad_norm": 2.53125, + "grad_norm_var": 0.009740193684895834, + "learning_rate": 0.0001, + "loss": 7.8639, + "loss/crossentropy": 2.4087116718292236, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2341567426919937, + "step": 2694 + }, + { + "epoch": 0.1685, + "grad_norm": 2.890625, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 7.9106, + "loss/crossentropy": 2.309167981147766, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24564050883054733, + "step": 2696 + }, + { + "epoch": 0.168625, + "grad_norm": 2.28125, + "grad_norm_var": 0.016364542643229167, + "learning_rate": 0.0001, + "loss": 7.6039, + "loss/crossentropy": 1.9676810503005981, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2102392315864563, + "step": 2698 + }, + { + "epoch": 0.16875, + "grad_norm": 2.65625, + "grad_norm_var": 0.017118326822916665, + "learning_rate": 0.0001, + "loss": 7.7754, + "loss/crossentropy": 2.380856513977051, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2278568521142006, + "step": 2700 + }, + { + "epoch": 0.168875, + "grad_norm": 2.828125, + "grad_norm_var": 0.020335896809895834, + "learning_rate": 0.0001, + "loss": 7.8029, + "loss/crossentropy": 2.2248435020446777, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23989464342594147, + "step": 2702 + }, + { + "epoch": 0.169, + "grad_norm": 2.65625, + "grad_norm_var": 0.0283355712890625, + "learning_rate": 0.0001, + "loss": 7.7909, + "loss/crossentropy": 2.2802765369415283, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23772113770246506, + "step": 2704 + }, + { + "epoch": 0.169125, + "grad_norm": 2.640625, + "grad_norm_var": 0.02998046875, + "learning_rate": 0.0001, + "loss": 7.8392, + "loss/crossentropy": 2.320490837097168, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.232273131608963, + "step": 2706 + }, + { + "epoch": 0.16925, + "grad_norm": 2.34375, + "grad_norm_var": 0.036279296875, + "learning_rate": 0.0001, + "loss": 7.7277, + "loss/crossentropy": 2.103710889816284, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.21974950283765793, + "step": 2708 + }, + { + "epoch": 0.169375, + "grad_norm": 2.90625, + "grad_norm_var": 0.0538726806640625, + "learning_rate": 0.0001, + "loss": 7.9849, + "loss/crossentropy": 2.311343789100647, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.27306586503982544, + "step": 2710 + }, + { + "epoch": 0.1695, + "grad_norm": 4.21875, + "grad_norm_var": 0.21113993326822916, + "learning_rate": 0.0001, + "loss": 7.898, + "loss/crossentropy": 2.248456120491028, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23485350608825684, + "step": 2712 + }, + { + "epoch": 0.169625, + "grad_norm": 3.640625, + "grad_norm_var": 0.2821116129557292, + "learning_rate": 0.0001, + "loss": 8.0907, + "loss/crossentropy": 2.3589508533477783, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.23614779859781265, + "step": 2714 + }, + { + "epoch": 0.16975, + "grad_norm": 2.9375, + "grad_norm_var": 0.29221903483072914, + "learning_rate": 0.0001, + "loss": 7.8759, + "loss/crossentropy": 2.472365975379944, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23715022206306458, + "step": 2716 + }, + { + "epoch": 0.169875, + "grad_norm": 2.328125, + "grad_norm_var": 0.3374420166015625, + "learning_rate": 0.0001, + "loss": 7.7942, + "loss/crossentropy": 2.264963388442993, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24661238491535187, + "step": 2718 + }, + { + "epoch": 0.17, + "grad_norm": 2.828125, + "grad_norm_var": 0.3333943684895833, + "learning_rate": 0.0001, + "loss": 7.9196, + "loss/crossentropy": 2.477718949317932, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25135859847068787, + "step": 2720 + }, + { + "epoch": 0.170125, + "grad_norm": 2.40625, + "grad_norm_var": 0.3395792643229167, + "learning_rate": 0.0001, + "loss": 7.8171, + "loss/crossentropy": 2.2033207416534424, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24111789464950562, + "step": 2722 + }, + { + "epoch": 0.17025, + "grad_norm": 2.671875, + "grad_norm_var": 0.3217844645182292, + "learning_rate": 0.0001, + "loss": 7.7115, + "loss/crossentropy": 2.3668792247772217, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2309272140264511, + "step": 2724 + }, + { + "epoch": 0.170375, + "grad_norm": 2.609375, + "grad_norm_var": 0.32696940104166666, + "learning_rate": 0.0001, + "loss": 7.807, + "loss/crossentropy": 2.359344244003296, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.25724074244499207, + "step": 2726 + }, + { + "epoch": 0.1705, + "grad_norm": 2.4375, + "grad_norm_var": 0.21409505208333332, + "learning_rate": 0.0001, + "loss": 7.7598, + "loss/crossentropy": 2.2009201049804688, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.23280736804008484, + "step": 2728 + }, + { + "epoch": 0.170625, + "grad_norm": 2.59375, + "grad_norm_var": 0.1070465087890625, + "learning_rate": 0.0001, + "loss": 7.8024, + "loss/crossentropy": 2.1781824827194214, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.23048165440559387, + "step": 2730 + }, + { + "epoch": 0.17075, + "grad_norm": 2.734375, + "grad_norm_var": 0.041890462239583336, + "learning_rate": 0.0001, + "loss": 7.867, + "loss/crossentropy": 2.2078417539596558, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24096353352069855, + "step": 2732 + }, + { + "epoch": 0.170875, + "grad_norm": 2.640625, + "grad_norm_var": 0.03166910807291667, + "learning_rate": 0.0001, + "loss": 7.8545, + "loss/crossentropy": 2.3574694395065308, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2407936155796051, + "step": 2734 + }, + { + "epoch": 0.171, + "grad_norm": 2.5, + "grad_norm_var": 0.026285807291666668, + "learning_rate": 0.0001, + "loss": 7.7336, + "loss/crossentropy": 2.3674607276916504, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2338072955608368, + "step": 2736 + }, + { + "epoch": 0.171125, + "grad_norm": 2.453125, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 7.7552, + "loss/crossentropy": 2.1523255109786987, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23672043532133102, + "step": 2738 + }, + { + "epoch": 0.17125, + "grad_norm": 2.375, + "grad_norm_var": 0.026471964518229165, + "learning_rate": 0.0001, + "loss": 7.6336, + "loss/crossentropy": 2.0955100655555725, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2361309677362442, + "step": 2740 + }, + { + "epoch": 0.171375, + "grad_norm": 3.140625, + "grad_norm_var": 0.036432902018229164, + "learning_rate": 0.0001, + "loss": 7.8829, + "loss/crossentropy": 2.509047269821167, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2578771486878395, + "step": 2742 + }, + { + "epoch": 0.1715, + "grad_norm": 2.484375, + "grad_norm_var": 0.03332926432291667, + "learning_rate": 0.0001, + "loss": 7.7991, + "loss/crossentropy": 2.2338361740112305, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.26055432856082916, + "step": 2744 + }, + { + "epoch": 0.171625, + "grad_norm": 2.59375, + "grad_norm_var": 0.033967081705729166, + "learning_rate": 0.0001, + "loss": 7.7437, + "loss/crossentropy": 2.1647003889083862, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23635494709014893, + "step": 2746 + }, + { + "epoch": 0.17175, + "grad_norm": 2.8125, + "grad_norm_var": 0.03599344889322917, + "learning_rate": 0.0001, + "loss": 7.7051, + "loss/crossentropy": 2.561861991882324, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2592911720275879, + "step": 2748 + }, + { + "epoch": 0.171875, + "grad_norm": 2.390625, + "grad_norm_var": 0.03669331868489583, + "learning_rate": 0.0001, + "loss": 7.6254, + "loss/crossentropy": 2.082680583000183, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.21699509769678116, + "step": 2750 + }, + { + "epoch": 0.172, + "grad_norm": 6.8125, + "grad_norm_var": 1.1539052327473958, + "learning_rate": 0.0001, + "loss": 7.8237, + "loss/crossentropy": 2.5276395082473755, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2522757425904274, + "step": 2752 + }, + { + "epoch": 0.172125, + "grad_norm": 5.1875, + "grad_norm_var": 6.189676920572917, + "learning_rate": 0.0001, + "loss": 8.0084, + "loss/crossentropy": 2.2055013179779053, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23886322230100632, + "step": 2754 + }, + { + "epoch": 0.17225, + "grad_norm": 2.375, + "grad_norm_var": 6.181050618489583, + "learning_rate": 0.0001, + "loss": 7.9047, + "loss/crossentropy": 2.394223690032959, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2515896260738373, + "step": 2756 + }, + { + "epoch": 0.172375, + "grad_norm": 2.90625, + "grad_norm_var": 6.165526326497396, + "learning_rate": 0.0001, + "loss": 7.9735, + "loss/crossentropy": 2.2563817501068115, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23824837803840637, + "step": 2758 + }, + { + "epoch": 0.1725, + "grad_norm": 2.484375, + "grad_norm_var": 6.159468587239584, + "learning_rate": 0.0001, + "loss": 7.8461, + "loss/crossentropy": 2.244271457195282, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.22762248665094376, + "step": 2760 + }, + { + "epoch": 0.172625, + "grad_norm": 2.609375, + "grad_norm_var": 6.164518229166666, + "learning_rate": 0.0001, + "loss": 7.7406, + "loss/crossentropy": 2.183770179748535, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.21703775227069855, + "step": 2762 + }, + { + "epoch": 0.17275, + "grad_norm": 3.5625, + "grad_norm_var": 6.166910807291667, + "learning_rate": 0.0001, + "loss": 7.9424, + "loss/crossentropy": 2.1734741926193237, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2613416016101837, + "step": 2764 + }, + { + "epoch": 0.172875, + "grad_norm": 2.859375, + "grad_norm_var": 6.048502604166667, + "learning_rate": 0.0001, + "loss": 7.9174, + "loss/crossentropy": 2.32150936126709, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23922567069530487, + "step": 2766 + }, + { + "epoch": 0.173, + "grad_norm": 2.65625, + "grad_norm_var": 5.403804524739583, + "learning_rate": 0.0001, + "loss": 7.6948, + "loss/crossentropy": 2.0935378074645996, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23874947428703308, + "step": 2768 + }, + { + "epoch": 0.173125, + "grad_norm": 2.703125, + "grad_norm_var": 0.39658915201822914, + "learning_rate": 0.0001, + "loss": 7.9079, + "loss/crossentropy": 2.2263898253440857, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24427123367786407, + "step": 2770 + }, + { + "epoch": 0.17325, + "grad_norm": 2.625, + "grad_norm_var": 0.37922261555989584, + "learning_rate": 0.0001, + "loss": 7.8895, + "loss/crossentropy": 2.1437301635742188, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.22952381521463394, + "step": 2772 + }, + { + "epoch": 0.173375, + "grad_norm": 2.4375, + "grad_norm_var": 0.40615946451822915, + "learning_rate": 0.0001, + "loss": 7.8372, + "loss/crossentropy": 2.293881416320801, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2393340766429901, + "step": 2774 + }, + { + "epoch": 0.1735, + "grad_norm": 2.640625, + "grad_norm_var": 0.4012980143229167, + "learning_rate": 0.0001, + "loss": 7.7442, + "loss/crossentropy": 2.258249878883362, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23926617950201035, + "step": 2776 + }, + { + "epoch": 0.173625, + "grad_norm": 3.953125, + "grad_norm_var": 0.4626210530598958, + "learning_rate": 0.0001, + "loss": 7.7726, + "loss/crossentropy": 2.0506081581115723, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2070065289735794, + "step": 2778 + }, + { + "epoch": 0.17375, + "grad_norm": 2.5625, + "grad_norm_var": 0.14410400390625, + "learning_rate": 0.0001, + "loss": 7.7289, + "loss/crossentropy": 2.130703330039978, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23245615512132645, + "step": 2780 + }, + { + "epoch": 0.173875, + "grad_norm": 2.578125, + "grad_norm_var": 0.1447906494140625, + "learning_rate": 0.0001, + "loss": 7.8126, + "loss/crossentropy": 2.336124062538147, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2596941590309143, + "step": 2782 + }, + { + "epoch": 0.174, + "grad_norm": 2.484375, + "grad_norm_var": 0.13846028645833333, + "learning_rate": 0.0001, + "loss": 7.7806, + "loss/crossentropy": 2.313853621482849, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24538902938365936, + "step": 2784 + }, + { + "epoch": 0.174125, + "grad_norm": 2.796875, + "grad_norm_var": 0.13967997233072918, + "learning_rate": 0.0001, + "loss": 7.7471, + "loss/crossentropy": 2.2884416580200195, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24766312539577484, + "step": 2786 + }, + { + "epoch": 0.17425, + "grad_norm": 2.609375, + "grad_norm_var": 0.14446512858072916, + "learning_rate": 0.0001, + "loss": 7.8361, + "loss/crossentropy": 2.3857977390289307, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2376108318567276, + "step": 2788 + }, + { + "epoch": 0.174375, + "grad_norm": 2.46875, + "grad_norm_var": 0.14230855305989584, + "learning_rate": 0.0001, + "loss": 7.7924, + "loss/crossentropy": 2.2684574127197266, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.25091154873371124, + "step": 2790 + }, + { + "epoch": 0.1745, + "grad_norm": 2.671875, + "grad_norm_var": 0.1416168212890625, + "learning_rate": 0.0001, + "loss": 7.7769, + "loss/crossentropy": 2.2278876304626465, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2252519577741623, + "step": 2792 + }, + { + "epoch": 0.174625, + "grad_norm": 2.4375, + "grad_norm_var": 0.03281962076822917, + "learning_rate": 0.0001, + "loss": 7.5981, + "loss/crossentropy": 2.2489346265792847, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23500560969114304, + "step": 2794 + }, + { + "epoch": 0.17475, + "grad_norm": 2.5, + "grad_norm_var": 0.022248331705729166, + "learning_rate": 0.0001, + "loss": 7.7545, + "loss/crossentropy": 2.295899510383606, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2576214596629143, + "step": 2796 + }, + { + "epoch": 0.174875, + "grad_norm": 2.46875, + "grad_norm_var": 0.0241851806640625, + "learning_rate": 0.0001, + "loss": 7.8388, + "loss/crossentropy": 2.3035519123077393, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2654409855604172, + "step": 2798 + }, + { + "epoch": 0.175, + "grad_norm": 3.015625, + "grad_norm_var": 0.037060546875, + "learning_rate": 0.0001, + "loss": 7.9235, + "loss/crossentropy": 2.2352579832077026, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2841232195496559, + "step": 2800 + }, + { + "epoch": 0.175125, + "grad_norm": 2.890625, + "grad_norm_var": 0.046483357747395836, + "learning_rate": 0.0001, + "loss": 7.709, + "loss/crossentropy": 2.3198055028915405, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23578546196222305, + "step": 2802 + }, + { + "epoch": 0.17525, + "grad_norm": 3.15625, + "grad_norm_var": 0.05718994140625, + "learning_rate": 0.0001, + "loss": 7.8402, + "loss/crossentropy": 2.2245510816574097, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23328936845064163, + "step": 2804 + }, + { + "epoch": 0.175375, + "grad_norm": 2.703125, + "grad_norm_var": 0.13108317057291666, + "learning_rate": 0.0001, + "loss": 7.7222, + "loss/crossentropy": 2.3215973377227783, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2323600873351097, + "step": 2806 + }, + { + "epoch": 0.1755, + "grad_norm": 2.53125, + "grad_norm_var": 0.13945210774739583, + "learning_rate": 0.0001, + "loss": 7.9641, + "loss/crossentropy": 2.6317641735076904, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25842827558517456, + "step": 2808 + }, + { + "epoch": 0.175625, + "grad_norm": 2.46875, + "grad_norm_var": 0.13935139973958333, + "learning_rate": 0.0001, + "loss": 7.82, + "loss/crossentropy": 2.215041160583496, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.25014058500528336, + "step": 2810 + }, + { + "epoch": 0.17575, + "grad_norm": 2.4375, + "grad_norm_var": 0.13599853515625, + "learning_rate": 0.0001, + "loss": 7.7395, + "loss/crossentropy": 2.1867226362228394, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23424651473760605, + "step": 2812 + }, + { + "epoch": 0.175875, + "grad_norm": 2.28125, + "grad_norm_var": 0.1615631103515625, + "learning_rate": 0.0001, + "loss": 7.5309, + "loss/crossentropy": 2.217566967010498, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.226304829120636, + "step": 2814 + }, + { + "epoch": 0.176, + "grad_norm": 2.875, + "grad_norm_var": 0.15181884765625, + "learning_rate": 0.0001, + "loss": 7.759, + "loss/crossentropy": 2.0618727803230286, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2439548224210739, + "step": 2816 + }, + { + "epoch": 0.176125, + "grad_norm": 3.515625, + "grad_norm_var": 0.6506795247395833, + "learning_rate": 0.0001, + "loss": 7.8753, + "loss/crossentropy": 2.409805655479431, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2505335807800293, + "step": 2818 + }, + { + "epoch": 0.17625, + "grad_norm": 3.03125, + "grad_norm_var": 0.6482248942057292, + "learning_rate": 0.0001, + "loss": 7.5632, + "loss/crossentropy": 2.03094744682312, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.21356388926506042, + "step": 2820 + }, + { + "epoch": 0.176375, + "grad_norm": 2.265625, + "grad_norm_var": 0.61324462890625, + "learning_rate": 0.0001, + "loss": 7.6257, + "loss/crossentropy": 2.3141270875930786, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.25053539127111435, + "step": 2822 + }, + { + "epoch": 0.1765, + "grad_norm": 2.71875, + "grad_norm_var": 0.60732421875, + "learning_rate": 0.0001, + "loss": 7.8661, + "loss/crossentropy": 2.174069106578827, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.22884509712457657, + "step": 2824 + }, + { + "epoch": 0.176625, + "grad_norm": 2.625, + "grad_norm_var": 0.6008626302083333, + "learning_rate": 0.0001, + "loss": 7.5315, + "loss/crossentropy": 2.08145010471344, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.22816456109285355, + "step": 2826 + }, + { + "epoch": 0.17675, + "grad_norm": 2.375, + "grad_norm_var": 0.6063313802083333, + "learning_rate": 0.0001, + "loss": 7.6201, + "loss/crossentropy": 2.094637870788574, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2243911251425743, + "step": 2828 + }, + { + "epoch": 0.176875, + "grad_norm": 2.53125, + "grad_norm_var": 0.5699940999348958, + "learning_rate": 0.0001, + "loss": 7.7378, + "loss/crossentropy": 2.1945928931236267, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2399984896183014, + "step": 2830 + }, + { + "epoch": 0.177, + "grad_norm": 2.5, + "grad_norm_var": 0.5867421468098958, + "learning_rate": 0.0001, + "loss": 7.6547, + "loss/crossentropy": 2.3012553453445435, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.23795722424983978, + "step": 2832 + }, + { + "epoch": 0.177125, + "grad_norm": 2.5, + "grad_norm_var": 0.0286773681640625, + "learning_rate": 0.0001, + "loss": 7.6384, + "loss/crossentropy": 2.424581289291382, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2470892071723938, + "step": 2834 + }, + { + "epoch": 0.17725, + "grad_norm": 2.671875, + "grad_norm_var": 0.0132720947265625, + "learning_rate": 0.0001, + "loss": 7.7292, + "loss/crossentropy": 2.136981964111328, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23704643547534943, + "step": 2836 + }, + { + "epoch": 0.177375, + "grad_norm": 2.421875, + "grad_norm_var": 0.01051025390625, + "learning_rate": 0.0001, + "loss": 7.8595, + "loss/crossentropy": 2.285550355911255, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23677106201648712, + "step": 2838 + }, + { + "epoch": 0.1775, + "grad_norm": 2.859375, + "grad_norm_var": 0.015262858072916666, + "learning_rate": 0.0001, + "loss": 7.6297, + "loss/crossentropy": 2.1494513750076294, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2412027269601822, + "step": 2840 + }, + { + "epoch": 0.177625, + "grad_norm": 3.0, + "grad_norm_var": 0.026984659830729167, + "learning_rate": 0.0001, + "loss": 7.7216, + "loss/crossentropy": 2.605257034301758, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25743308663368225, + "step": 2842 + }, + { + "epoch": 0.17775, + "grad_norm": 2.5, + "grad_norm_var": 0.028539021809895832, + "learning_rate": 0.0001, + "loss": 7.6461, + "loss/crossentropy": 2.0102819204330444, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2197662517428398, + "step": 2844 + }, + { + "epoch": 0.177875, + "grad_norm": 2.6875, + "grad_norm_var": 0.028564453125, + "learning_rate": 0.0001, + "loss": 7.8794, + "loss/crossentropy": 2.154181718826294, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23210670053958893, + "step": 2846 + }, + { + "epoch": 0.178, + "grad_norm": 2.421875, + "grad_norm_var": 0.0362945556640625, + "learning_rate": 0.0001, + "loss": 7.8658, + "loss/crossentropy": 2.0595306158065796, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.26342423260211945, + "step": 2848 + }, + { + "epoch": 0.178125, + "grad_norm": 2.578125, + "grad_norm_var": 0.03465067545572917, + "learning_rate": 0.0001, + "loss": 7.5678, + "loss/crossentropy": 2.179203987121582, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.22831647098064423, + "step": 2850 + }, + { + "epoch": 0.17825, + "grad_norm": 2.875, + "grad_norm_var": 0.0369781494140625, + "learning_rate": 0.0001, + "loss": 7.6801, + "loss/crossentropy": 2.0674314498901367, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23797215521335602, + "step": 2852 + }, + { + "epoch": 0.178375, + "grad_norm": 2.765625, + "grad_norm_var": 0.0592681884765625, + "learning_rate": 0.0001, + "loss": 7.9571, + "loss/crossentropy": 2.3534988164901733, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26077476143836975, + "step": 2854 + }, + { + "epoch": 0.1785, + "grad_norm": 2.578125, + "grad_norm_var": 0.07317708333333334, + "learning_rate": 0.0001, + "loss": 7.6968, + "loss/crossentropy": 2.10899019241333, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.22398869693279266, + "step": 2856 + }, + { + "epoch": 0.178625, + "grad_norm": 2.4375, + "grad_norm_var": 0.07069905598958333, + "learning_rate": 0.0001, + "loss": 7.8299, + "loss/crossentropy": 2.319381833076477, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24246351420879364, + "step": 2858 + }, + { + "epoch": 0.17875, + "grad_norm": 2.484375, + "grad_norm_var": 0.07053934733072917, + "learning_rate": 0.0001, + "loss": 7.6359, + "loss/crossentropy": 2.083498954772949, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.23692196607589722, + "step": 2860 + }, + { + "epoch": 0.178875, + "grad_norm": 2.359375, + "grad_norm_var": 0.0753814697265625, + "learning_rate": 0.0001, + "loss": 7.8099, + "loss/crossentropy": 2.1947683095932007, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.24239200353622437, + "step": 2862 + }, + { + "epoch": 0.179, + "grad_norm": 2.671875, + "grad_norm_var": 0.06674702962239583, + "learning_rate": 0.0001, + "loss": 7.7765, + "loss/crossentropy": 2.209209442138672, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.24729400128126144, + "step": 2864 + }, + { + "epoch": 0.179125, + "grad_norm": 2.53125, + "grad_norm_var": 0.0657135009765625, + "learning_rate": 0.0001, + "loss": 7.8832, + "loss/crossentropy": 2.3722325563430786, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.24714642018079758, + "step": 2866 + }, + { + "epoch": 0.17925, + "grad_norm": 2.578125, + "grad_norm_var": 0.07121988932291666, + "learning_rate": 0.0001, + "loss": 7.7811, + "loss/crossentropy": 2.4134687185287476, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2480832040309906, + "step": 2868 + }, + { + "epoch": 0.179375, + "grad_norm": 2.625, + "grad_norm_var": 0.0347320556640625, + "learning_rate": 0.0001, + "loss": 7.7289, + "loss/crossentropy": 2.2061760425567627, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23551230877637863, + "step": 2870 + }, + { + "epoch": 0.1795, + "grad_norm": 2.5625, + "grad_norm_var": 0.030790201822916665, + "learning_rate": 0.0001, + "loss": 7.8462, + "loss/crossentropy": 2.436152458190918, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2384696677327156, + "step": 2872 + }, + { + "epoch": 0.179625, + "grad_norm": 2.375, + "grad_norm_var": 0.03289388020833333, + "learning_rate": 0.0001, + "loss": 7.7489, + "loss/crossentropy": 2.3130866289138794, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.27254779636859894, + "step": 2874 + }, + { + "epoch": 0.17975, + "grad_norm": 2.828125, + "grad_norm_var": 0.0348052978515625, + "learning_rate": 0.0001, + "loss": 7.8657, + "loss/crossentropy": 2.294617772102356, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23688946664333344, + "step": 2876 + }, + { + "epoch": 0.179875, + "grad_norm": 2.5625, + "grad_norm_var": 0.030497233072916668, + "learning_rate": 0.0001, + "loss": 7.7343, + "loss/crossentropy": 2.3553664684295654, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2647460997104645, + "step": 2878 + }, + { + "epoch": 0.18, + "grad_norm": 2.734375, + "grad_norm_var": 0.031180826822916667, + "learning_rate": 0.0001, + "loss": 7.7173, + "loss/crossentropy": 2.3102493286132812, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23434799164533615, + "step": 2880 + }, + { + "epoch": 0.180125, + "grad_norm": 2.5, + "grad_norm_var": 0.030378214518229165, + "learning_rate": 0.0001, + "loss": 7.7913, + "loss/crossentropy": 2.298587918281555, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22284646332263947, + "step": 2882 + }, + { + "epoch": 0.18025, + "grad_norm": 2.78125, + "grad_norm_var": 0.024128214518229166, + "learning_rate": 0.0001, + "loss": 7.6415, + "loss/crossentropy": 2.1285579204559326, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.23009717464447021, + "step": 2884 + }, + { + "epoch": 0.180375, + "grad_norm": 2.546875, + "grad_norm_var": 0.021419270833333334, + "learning_rate": 0.0001, + "loss": 7.7626, + "loss/crossentropy": 2.1065726280212402, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.20703819394111633, + "step": 2886 + }, + { + "epoch": 0.1805, + "grad_norm": 2.734375, + "grad_norm_var": 0.0209625244140625, + "learning_rate": 0.0001, + "loss": 7.8513, + "loss/crossentropy": 2.2261340618133545, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25797754526138306, + "step": 2888 + }, + { + "epoch": 0.180625, + "grad_norm": 2.421875, + "grad_norm_var": 0.0176910400390625, + "learning_rate": 0.0001, + "loss": 7.8405, + "loss/crossentropy": 2.5347334146499634, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.25483807921409607, + "step": 2890 + }, + { + "epoch": 0.18075, + "grad_norm": 2.5625, + "grad_norm_var": 0.016141764322916665, + "learning_rate": 0.0001, + "loss": 7.8018, + "loss/crossentropy": 2.2389495372772217, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2264159545302391, + "step": 2892 + }, + { + "epoch": 0.180875, + "grad_norm": 2.453125, + "grad_norm_var": 0.017552693684895832, + "learning_rate": 0.0001, + "loss": 7.8118, + "loss/crossentropy": 2.2665982246398926, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24241438508033752, + "step": 2894 + }, + { + "epoch": 0.181, + "grad_norm": 2.5625, + "grad_norm_var": 0.017430623372395832, + "learning_rate": 0.0001, + "loss": 7.7171, + "loss/crossentropy": 2.0678027272224426, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21813754737377167, + "step": 2896 + }, + { + "epoch": 0.181125, + "grad_norm": 2.59375, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 7.7695, + "loss/crossentropy": 2.3117023706436157, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.23616492748260498, + "step": 2898 + }, + { + "epoch": 0.18125, + "grad_norm": 2.5625, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 0.0001, + "loss": 7.8045, + "loss/crossentropy": 2.30752170085907, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.22399520874023438, + "step": 2900 + }, + { + "epoch": 0.181375, + "grad_norm": 2.421875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 8.0111, + "loss/crossentropy": 2.441025972366333, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.25707243382930756, + "step": 2902 + }, + { + "epoch": 0.1815, + "grad_norm": 2.421875, + "grad_norm_var": 0.012848917643229167, + "learning_rate": 0.0001, + "loss": 7.6732, + "loss/crossentropy": 2.146459937095642, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24353434145450592, + "step": 2904 + }, + { + "epoch": 0.181625, + "grad_norm": 2.8125, + "grad_norm_var": 0.017609659830729166, + "learning_rate": 0.0001, + "loss": 7.7333, + "loss/crossentropy": 2.4673913717269897, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2522246688604355, + "step": 2906 + }, + { + "epoch": 0.18175, + "grad_norm": 2.40625, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 7.7385, + "loss/crossentropy": 2.1044957637786865, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.235269233584404, + "step": 2908 + }, + { + "epoch": 0.181875, + "grad_norm": 2.328125, + "grad_norm_var": 0.022347005208333333, + "learning_rate": 0.0001, + "loss": 7.4037, + "loss/crossentropy": 2.2339547872543335, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2398391216993332, + "step": 2910 + }, + { + "epoch": 0.182, + "grad_norm": 2.6875, + "grad_norm_var": 0.0226715087890625, + "learning_rate": 0.0001, + "loss": 7.5718, + "loss/crossentropy": 2.1950976848602295, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23091710358858109, + "step": 2912 + }, + { + "epoch": 0.182125, + "grad_norm": 2.640625, + "grad_norm_var": 0.02310791015625, + "learning_rate": 0.0001, + "loss": 7.6153, + "loss/crossentropy": 2.3033652305603027, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26021257042884827, + "step": 2914 + }, + { + "epoch": 0.18225, + "grad_norm": 2.4375, + "grad_norm_var": 0.017838541666666666, + "learning_rate": 0.0001, + "loss": 7.8996, + "loss/crossentropy": 2.4616453647613525, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2583332806825638, + "step": 2916 + }, + { + "epoch": 0.182375, + "grad_norm": 2.484375, + "grad_norm_var": 0.018456013997395833, + "learning_rate": 0.0001, + "loss": 7.6534, + "loss/crossentropy": 2.3851388692855835, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24298951029777527, + "step": 2918 + }, + { + "epoch": 0.1825, + "grad_norm": 2.515625, + "grad_norm_var": 0.016890462239583334, + "learning_rate": 0.0001, + "loss": 7.6927, + "loss/crossentropy": 2.409003496170044, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.22873351722955704, + "step": 2920 + }, + { + "epoch": 0.182625, + "grad_norm": 2.828125, + "grad_norm_var": 0.03720703125, + "learning_rate": 0.0001, + "loss": 7.7708, + "loss/crossentropy": 2.233977437019348, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.250872403383255, + "step": 2922 + }, + { + "epoch": 0.18275, + "grad_norm": 2.296875, + "grad_norm_var": 0.03955790201822917, + "learning_rate": 0.0001, + "loss": 7.8246, + "loss/crossentropy": 2.2683684825897217, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2577130198478699, + "step": 2924 + }, + { + "epoch": 0.182875, + "grad_norm": 2.734375, + "grad_norm_var": 0.04023030598958333, + "learning_rate": 0.0001, + "loss": 7.7134, + "loss/crossentropy": 2.2642526626586914, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25480280816555023, + "step": 2926 + }, + { + "epoch": 0.183, + "grad_norm": 2.546875, + "grad_norm_var": 0.03713785807291667, + "learning_rate": 0.0001, + "loss": 7.8294, + "loss/crossentropy": 2.176198959350586, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.22762203961610794, + "step": 2928 + }, + { + "epoch": 0.183125, + "grad_norm": 2.65625, + "grad_norm_var": 0.037007649739583336, + "learning_rate": 0.0001, + "loss": 7.7678, + "loss/crossentropy": 2.5741217136383057, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24319174140691757, + "step": 2930 + }, + { + "epoch": 0.18325, + "grad_norm": 2.765625, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 0.0001, + "loss": 7.744, + "loss/crossentropy": 2.4425946474075317, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2534085810184479, + "step": 2932 + }, + { + "epoch": 0.183375, + "grad_norm": 2.65625, + "grad_norm_var": 0.0351470947265625, + "learning_rate": 0.0001, + "loss": 7.6372, + "loss/crossentropy": 2.136154890060425, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23775418102741241, + "step": 2934 + }, + { + "epoch": 0.1835, + "grad_norm": 2.5, + "grad_norm_var": 0.035380045572916664, + "learning_rate": 0.0001, + "loss": 7.8286, + "loss/crossentropy": 2.249310851097107, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.24401143193244934, + "step": 2936 + }, + { + "epoch": 0.183625, + "grad_norm": 2.71875, + "grad_norm_var": 0.018912760416666667, + "learning_rate": 0.0001, + "loss": 7.8273, + "loss/crossentropy": 2.3503148555755615, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24899056553840637, + "step": 2938 + }, + { + "epoch": 0.18375, + "grad_norm": 2.546875, + "grad_norm_var": 0.012923177083333333, + "learning_rate": 0.0001, + "loss": 7.5477, + "loss/crossentropy": 2.147356152534485, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24195496737957, + "step": 2940 + }, + { + "epoch": 0.183875, + "grad_norm": 2.3125, + "grad_norm_var": 0.017496744791666668, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.314660429954529, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24584651738405228, + "step": 2942 + }, + { + "epoch": 0.184, + "grad_norm": 2.796875, + "grad_norm_var": 0.020829264322916666, + "learning_rate": 0.0001, + "loss": 7.6413, + "loss/crossentropy": 2.0859320759773254, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2166888415813446, + "step": 2944 + }, + { + "epoch": 0.184125, + "grad_norm": 2.234375, + "grad_norm_var": 0.028837076822916665, + "learning_rate": 0.0001, + "loss": 7.7158, + "loss/crossentropy": 2.305862069129944, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23023030161857605, + "step": 2946 + }, + { + "epoch": 0.18425, + "grad_norm": 2.609375, + "grad_norm_var": 0.03004150390625, + "learning_rate": 0.0001, + "loss": 7.635, + "loss/crossentropy": 2.345908284187317, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.22712141275405884, + "step": 2948 + }, + { + "epoch": 0.184375, + "grad_norm": 2.90625, + "grad_norm_var": 0.03779296875, + "learning_rate": 0.0001, + "loss": 7.7227, + "loss/crossentropy": 1.9320513010025024, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.20210829377174377, + "step": 2950 + }, + { + "epoch": 0.1845, + "grad_norm": 2.265625, + "grad_norm_var": 0.04317118326822917, + "learning_rate": 0.0001, + "loss": 7.6749, + "loss/crossentropy": 2.2975679636001587, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.23657388985157013, + "step": 2952 + }, + { + "epoch": 0.184625, + "grad_norm": 2.6875, + "grad_norm_var": 0.04095052083333333, + "learning_rate": 0.0001, + "loss": 7.7486, + "loss/crossentropy": 2.2370001077651978, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.24035517871379852, + "step": 2954 + }, + { + "epoch": 0.18475, + "grad_norm": 2.46875, + "grad_norm_var": 0.046418253580729166, + "learning_rate": 0.0001, + "loss": 7.77, + "loss/crossentropy": 2.3344703912734985, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24448612332344055, + "step": 2956 + }, + { + "epoch": 0.184875, + "grad_norm": 2.671875, + "grad_norm_var": 0.0432037353515625, + "learning_rate": 0.0001, + "loss": 7.7287, + "loss/crossentropy": 2.3184186220169067, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24355477839708328, + "step": 2958 + }, + { + "epoch": 0.185, + "grad_norm": 2.671875, + "grad_norm_var": 0.04150390625, + "learning_rate": 0.0001, + "loss": 7.793, + "loss/crossentropy": 2.4449312686920166, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24685372412204742, + "step": 2960 + }, + { + "epoch": 0.185125, + "grad_norm": 2.5, + "grad_norm_var": 0.0351226806640625, + "learning_rate": 0.0001, + "loss": 7.7295, + "loss/crossentropy": 2.0395036935806274, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22259121388196945, + "step": 2962 + }, + { + "epoch": 0.18525, + "grad_norm": 2.5625, + "grad_norm_var": 0.0314453125, + "learning_rate": 0.0001, + "loss": 7.6614, + "loss/crossentropy": 2.029510021209717, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22326287627220154, + "step": 2964 + }, + { + "epoch": 0.185375, + "grad_norm": 2.65625, + "grad_norm_var": 0.02506103515625, + "learning_rate": 0.0001, + "loss": 7.6142, + "loss/crossentropy": 2.2890524864196777, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.22059021890163422, + "step": 2966 + }, + { + "epoch": 0.1855, + "grad_norm": 2.578125, + "grad_norm_var": 0.021434529622395834, + "learning_rate": 0.0001, + "loss": 7.7904, + "loss/crossentropy": 2.2007906436920166, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.22886135429143906, + "step": 2968 + }, + { + "epoch": 0.185625, + "grad_norm": 2.578125, + "grad_norm_var": 0.016097005208333334, + "learning_rate": 0.0001, + "loss": 7.7165, + "loss/crossentropy": 2.4090301990509033, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24099014699459076, + "step": 2970 + }, + { + "epoch": 0.18575, + "grad_norm": 2.34375, + "grad_norm_var": 0.014225260416666666, + "learning_rate": 0.0001, + "loss": 7.7287, + "loss/crossentropy": 2.358201503753662, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2252245992422104, + "step": 2972 + }, + { + "epoch": 0.185875, + "grad_norm": 2.71875, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 7.623, + "loss/crossentropy": 2.3175272941589355, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23873913288116455, + "step": 2974 + }, + { + "epoch": 0.186, + "grad_norm": 2.5625, + "grad_norm_var": 0.0131988525390625, + "learning_rate": 0.0001, + "loss": 7.6915, + "loss/crossentropy": 2.590595841407776, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24967636168003082, + "step": 2976 + }, + { + "epoch": 0.186125, + "grad_norm": 2.484375, + "grad_norm_var": 0.015306599934895833, + "learning_rate": 0.0001, + "loss": 7.6723, + "loss/crossentropy": 2.2069579362869263, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22214916348457336, + "step": 2978 + }, + { + "epoch": 0.18625, + "grad_norm": 2.65625, + "grad_norm_var": 0.015729777018229165, + "learning_rate": 0.0001, + "loss": 7.8361, + "loss/crossentropy": 2.3302581310272217, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2437615841627121, + "step": 2980 + }, + { + "epoch": 0.186375, + "grad_norm": 2.453125, + "grad_norm_var": 0.014232381184895834, + "learning_rate": 0.0001, + "loss": 7.6333, + "loss/crossentropy": 2.2672786712646484, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.22940023988485336, + "step": 2982 + }, + { + "epoch": 0.1865, + "grad_norm": 2.65625, + "grad_norm_var": 0.014290364583333333, + "learning_rate": 0.0001, + "loss": 7.7551, + "loss/crossentropy": 2.528477191925049, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24014821648597717, + "step": 2984 + }, + { + "epoch": 0.186625, + "grad_norm": 2.796875, + "grad_norm_var": 0.0172515869140625, + "learning_rate": 0.0001, + "loss": 7.6699, + "loss/crossentropy": 2.134658455848694, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2487226352095604, + "step": 2986 + }, + { + "epoch": 0.18675, + "grad_norm": 2.34375, + "grad_norm_var": 0.0167388916015625, + "learning_rate": 0.0001, + "loss": 7.6979, + "loss/crossentropy": 2.3620848655700684, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24307234585285187, + "step": 2988 + }, + { + "epoch": 0.186875, + "grad_norm": 2.671875, + "grad_norm_var": 0.015250651041666667, + "learning_rate": 0.0001, + "loss": 7.7099, + "loss/crossentropy": 2.4233195781707764, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2325136736035347, + "step": 2990 + }, + { + "epoch": 0.187, + "grad_norm": 2.515625, + "grad_norm_var": 0.015462239583333334, + "learning_rate": 0.0001, + "loss": 7.6485, + "loss/crossentropy": 2.2925750017166138, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24893249571323395, + "step": 2992 + }, + { + "epoch": 0.187125, + "grad_norm": 4.125, + "grad_norm_var": 0.16298421223958334, + "learning_rate": 0.0001, + "loss": 7.7527, + "loss/crossentropy": 2.1467760801315308, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23566482961177826, + "step": 2994 + }, + { + "epoch": 0.18725, + "grad_norm": 2.4375, + "grad_norm_var": 0.16752827962239583, + "learning_rate": 0.0001, + "loss": 7.7408, + "loss/crossentropy": 2.103184461593628, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2323935329914093, + "step": 2996 + }, + { + "epoch": 0.187375, + "grad_norm": 2.65625, + "grad_norm_var": 0.16705322265625, + "learning_rate": 0.0001, + "loss": 7.7494, + "loss/crossentropy": 2.174781620502472, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25046705454587936, + "step": 2998 + }, + { + "epoch": 0.1875, + "grad_norm": 2.625, + "grad_norm_var": 0.16780192057291668, + "learning_rate": 0.0001, + "loss": 7.7174, + "loss/crossentropy": 2.2673741579055786, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.22653405368328094, + "step": 3000 + }, + { + "epoch": 0.187625, + "grad_norm": 3.296875, + "grad_norm_var": 0.19199117024739584, + "learning_rate": 0.0001, + "loss": 7.7319, + "loss/crossentropy": 2.157706141471863, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2287752330303192, + "step": 3002 + }, + { + "epoch": 0.18775, + "grad_norm": 3.0625, + "grad_norm_var": 0.2681223551432292, + "learning_rate": 0.0001, + "loss": 7.8417, + "loss/crossentropy": 2.0640329122543335, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2371016889810562, + "step": 3004 + }, + { + "epoch": 0.187875, + "grad_norm": 2.328125, + "grad_norm_var": 0.28884989420572915, + "learning_rate": 0.0001, + "loss": 7.4793, + "loss/crossentropy": 2.1649757027626038, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.22954751551151276, + "step": 3006 + }, + { + "epoch": 0.188, + "grad_norm": 2.9375, + "grad_norm_var": 0.2865397135416667, + "learning_rate": 0.0001, + "loss": 7.7681, + "loss/crossentropy": 2.402653217315674, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2439199835062027, + "step": 3008 + }, + { + "epoch": 0.188125, + "grad_norm": 2.671875, + "grad_norm_var": 0.16442057291666667, + "learning_rate": 0.0001, + "loss": 7.7058, + "loss/crossentropy": 2.0360541343688965, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2370767444372177, + "step": 3010 + }, + { + "epoch": 0.18825, + "grad_norm": 2.78125, + "grad_norm_var": 0.16503499348958334, + "learning_rate": 0.0001, + "loss": 7.78, + "loss/crossentropy": 2.5376839637756348, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25698722898960114, + "step": 3012 + }, + { + "epoch": 0.188375, + "grad_norm": 2.546875, + "grad_norm_var": 0.16298421223958334, + "learning_rate": 0.0001, + "loss": 7.6674, + "loss/crossentropy": 2.153092384338379, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22976408153772354, + "step": 3014 + }, + { + "epoch": 0.1885, + "grad_norm": 2.421875, + "grad_norm_var": 0.16728413899739583, + "learning_rate": 0.0001, + "loss": 7.9659, + "loss/crossentropy": 2.4408687353134155, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2630517780780792, + "step": 3016 + }, + { + "epoch": 0.188625, + "grad_norm": 2.546875, + "grad_norm_var": 0.15907796223958334, + "learning_rate": 0.0001, + "loss": 7.6187, + "loss/crossentropy": 2.1757636070251465, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2302849441766739, + "step": 3018 + }, + { + "epoch": 0.18875, + "grad_norm": 2.71875, + "grad_norm_var": 0.03792215983072917, + "learning_rate": 0.0001, + "loss": 7.5604, + "loss/crossentropy": 2.064240336418152, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22439640015363693, + "step": 3020 + }, + { + "epoch": 0.188875, + "grad_norm": 2.46875, + "grad_norm_var": 0.03224995930989583, + "learning_rate": 0.0001, + "loss": 7.6954, + "loss/crossentropy": 2.1221320629119873, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22975972294807434, + "step": 3022 + }, + { + "epoch": 0.189, + "grad_norm": 2.59375, + "grad_norm_var": 0.023714192708333335, + "learning_rate": 0.0001, + "loss": 7.8211, + "loss/crossentropy": 2.243198275566101, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2282658889889717, + "step": 3024 + }, + { + "epoch": 0.189125, + "grad_norm": 2.5, + "grad_norm_var": 0.0191558837890625, + "learning_rate": 0.0001, + "loss": 7.6639, + "loss/crossentropy": 1.9989042282104492, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23544684797525406, + "step": 3026 + }, + { + "epoch": 0.18925, + "grad_norm": 2.671875, + "grad_norm_var": 0.015510050455729167, + "learning_rate": 0.0001, + "loss": 7.6059, + "loss/crossentropy": 2.0583502054214478, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22483647614717484, + "step": 3028 + }, + { + "epoch": 0.189375, + "grad_norm": 2.5625, + "grad_norm_var": 0.019481404622395834, + "learning_rate": 0.0001, + "loss": 7.5609, + "loss/crossentropy": 2.213624954223633, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23435892909765244, + "step": 3030 + }, + { + "epoch": 0.1895, + "grad_norm": 2.625, + "grad_norm_var": 0.015445963541666666, + "learning_rate": 0.0001, + "loss": 7.7361, + "loss/crossentropy": 2.2522560358047485, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24112869054079056, + "step": 3032 + }, + { + "epoch": 0.189625, + "grad_norm": 2.453125, + "grad_norm_var": 0.011865234375, + "learning_rate": 0.0001, + "loss": 7.9651, + "loss/crossentropy": 2.325987696647644, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22860444337129593, + "step": 3034 + }, + { + "epoch": 0.18975, + "grad_norm": 2.75, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 7.6774, + "loss/crossentropy": 2.292188882827759, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24093221873044968, + "step": 3036 + }, + { + "epoch": 0.189875, + "grad_norm": 2.859375, + "grad_norm_var": 0.020677693684895835, + "learning_rate": 0.0001, + "loss": 7.7072, + "loss/crossentropy": 2.1392345428466797, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2271246463060379, + "step": 3038 + }, + { + "epoch": 0.19, + "grad_norm": 2.328125, + "grad_norm_var": 0.025584920247395834, + "learning_rate": 0.0001, + "loss": 7.7131, + "loss/crossentropy": 2.3634947538375854, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2574180141091347, + "step": 3040 + }, + { + "epoch": 0.190125, + "grad_norm": 2.5, + "grad_norm_var": 0.025406901041666666, + "learning_rate": 0.0001, + "loss": 7.6816, + "loss/crossentropy": 2.224321484565735, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22878948599100113, + "step": 3042 + }, + { + "epoch": 0.19025, + "grad_norm": 2.65625, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 7.7158, + "loss/crossentropy": 2.471584916114807, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2526979222893715, + "step": 3044 + }, + { + "epoch": 0.190375, + "grad_norm": 2.703125, + "grad_norm_var": 0.02271728515625, + "learning_rate": 0.0001, + "loss": 7.4846, + "loss/crossentropy": 2.2407480478286743, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.23275888711214066, + "step": 3046 + }, + { + "epoch": 0.1905, + "grad_norm": 2.484375, + "grad_norm_var": 0.023140462239583333, + "learning_rate": 0.0001, + "loss": 7.7875, + "loss/crossentropy": 2.2426388263702393, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23194129765033722, + "step": 3048 + }, + { + "epoch": 0.190625, + "grad_norm": 2.921875, + "grad_norm_var": 0.029488118489583333, + "learning_rate": 0.0001, + "loss": 7.6974, + "loss/crossentropy": 2.3478230237960815, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24639128148555756, + "step": 3050 + }, + { + "epoch": 0.19075, + "grad_norm": 2.390625, + "grad_norm_var": 0.030973307291666665, + "learning_rate": 0.0001, + "loss": 7.8786, + "loss/crossentropy": 2.3184871673583984, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2819037437438965, + "step": 3052 + }, + { + "epoch": 0.190875, + "grad_norm": 2.59375, + "grad_norm_var": 0.025520833333333333, + "learning_rate": 0.0001, + "loss": 7.6416, + "loss/crossentropy": 2.140998363494873, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22248034179210663, + "step": 3054 + }, + { + "epoch": 0.191, + "grad_norm": 2.40625, + "grad_norm_var": 0.024540201822916666, + "learning_rate": 0.0001, + "loss": 7.7976, + "loss/crossentropy": 2.492767333984375, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2436331883072853, + "step": 3056 + }, + { + "epoch": 0.191125, + "grad_norm": 2.796875, + "grad_norm_var": 0.026851399739583334, + "learning_rate": 0.0001, + "loss": 7.6777, + "loss/crossentropy": 1.9927314519882202, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22899659723043442, + "step": 3058 + }, + { + "epoch": 0.19125, + "grad_norm": 2.5, + "grad_norm_var": 0.0286285400390625, + "learning_rate": 0.0001, + "loss": 7.9659, + "loss/crossentropy": 2.2941343784332275, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.24661502987146378, + "step": 3060 + }, + { + "epoch": 0.191375, + "grad_norm": 2.5, + "grad_norm_var": 0.02808837890625, + "learning_rate": 0.0001, + "loss": 7.8332, + "loss/crossentropy": 2.2587934732437134, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.23479507118463516, + "step": 3062 + }, + { + "epoch": 0.1915, + "grad_norm": 2.484375, + "grad_norm_var": 0.0285797119140625, + "learning_rate": 0.0001, + "loss": 7.7855, + "loss/crossentropy": 2.448140263557434, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24534112215042114, + "step": 3064 + }, + { + "epoch": 0.191625, + "grad_norm": 2.734375, + "grad_norm_var": 0.022802734375, + "learning_rate": 0.0001, + "loss": 7.6054, + "loss/crossentropy": 2.5346790552139282, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2549649178981781, + "step": 3066 + }, + { + "epoch": 0.19175, + "grad_norm": 2.390625, + "grad_norm_var": 0.0201324462890625, + "learning_rate": 0.0001, + "loss": 7.6652, + "loss/crossentropy": 2.139198064804077, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.22500670701265335, + "step": 3068 + }, + { + "epoch": 0.191875, + "grad_norm": 2.5, + "grad_norm_var": 0.020052083333333335, + "learning_rate": 0.0001, + "loss": 7.6343, + "loss/crossentropy": 2.2101441621780396, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.24504300951957703, + "step": 3070 + }, + { + "epoch": 0.192, + "grad_norm": 2.78125, + "grad_norm_var": 0.018387858072916666, + "learning_rate": 0.0001, + "loss": 7.8094, + "loss/crossentropy": 2.387241005897522, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.23291389644145966, + "step": 3072 + }, + { + "epoch": 0.192125, + "grad_norm": 2.515625, + "grad_norm_var": 0.015632120768229167, + "learning_rate": 0.0001, + "loss": 7.657, + "loss/crossentropy": 2.015101671218872, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2168959379196167, + "step": 3074 + }, + { + "epoch": 0.19225, + "grad_norm": 2.359375, + "grad_norm_var": 0.016044108072916667, + "learning_rate": 0.0001, + "loss": 7.5738, + "loss/crossentropy": 2.201832890510559, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.25159038603305817, + "step": 3076 + }, + { + "epoch": 0.192375, + "grad_norm": 2.640625, + "grad_norm_var": 0.017699178059895834, + "learning_rate": 0.0001, + "loss": 7.7274, + "loss/crossentropy": 2.4729052782058716, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2636168450117111, + "step": 3078 + }, + { + "epoch": 0.1925, + "grad_norm": 2.5625, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 7.4003, + "loss/crossentropy": 2.096401810646057, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.22800646722316742, + "step": 3080 + }, + { + "epoch": 0.192625, + "grad_norm": 2.71875, + "grad_norm_var": 0.017316691080729165, + "learning_rate": 0.0001, + "loss": 7.5855, + "loss/crossentropy": 2.1243752241134644, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2130807489156723, + "step": 3082 + }, + { + "epoch": 0.19275, + "grad_norm": 2.59375, + "grad_norm_var": 0.016258748372395833, + "learning_rate": 0.0001, + "loss": 7.6234, + "loss/crossentropy": 2.393889904022217, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23582585901021957, + "step": 3084 + }, + { + "epoch": 0.192875, + "grad_norm": 2.453125, + "grad_norm_var": 0.0202789306640625, + "learning_rate": 0.0001, + "loss": 7.7359, + "loss/crossentropy": 2.530544877052307, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23919443786144257, + "step": 3086 + }, + { + "epoch": 0.193, + "grad_norm": 2.453125, + "grad_norm_var": 0.018895467122395832, + "learning_rate": 0.0001, + "loss": 7.7122, + "loss/crossentropy": 2.14614474773407, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23280902951955795, + "step": 3088 + }, + { + "epoch": 0.193125, + "grad_norm": 2.546875, + "grad_norm_var": 0.025519816080729167, + "learning_rate": 0.0001, + "loss": 7.6703, + "loss/crossentropy": 2.1698378324508667, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23564688116312027, + "step": 3090 + }, + { + "epoch": 0.19325, + "grad_norm": 2.296875, + "grad_norm_var": 0.0282379150390625, + "learning_rate": 0.0001, + "loss": 7.5935, + "loss/crossentropy": 2.2789262533187866, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23355238884687424, + "step": 3092 + }, + { + "epoch": 0.193375, + "grad_norm": 2.78125, + "grad_norm_var": 0.030013020833333334, + "learning_rate": 0.0001, + "loss": 7.8383, + "loss/crossentropy": 2.548181891441345, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.242641419172287, + "step": 3094 + }, + { + "epoch": 0.1935, + "grad_norm": 2.4375, + "grad_norm_var": 0.031525675455729166, + "learning_rate": 0.0001, + "loss": 7.5252, + "loss/crossentropy": 2.34587025642395, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2449270710349083, + "step": 3096 + }, + { + "epoch": 0.193625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04414774576822917, + "learning_rate": 0.0001, + "loss": 7.5994, + "loss/crossentropy": 2.2254514694213867, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2519787400960922, + "step": 3098 + }, + { + "epoch": 0.19375, + "grad_norm": 2.640625, + "grad_norm_var": 0.060301717122395834, + "learning_rate": 0.0001, + "loss": 7.623, + "loss/crossentropy": 2.3490875959396362, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.24305754899978638, + "step": 3100 + }, + { + "epoch": 0.193875, + "grad_norm": 2.34375, + "grad_norm_var": 0.0575836181640625, + "learning_rate": 0.0001, + "loss": 7.7137, + "loss/crossentropy": 2.021351933479309, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2211918607354164, + "step": 3102 + }, + { + "epoch": 0.194, + "grad_norm": 2.8125, + "grad_norm_var": 0.060791015625, + "learning_rate": 0.0001, + "loss": 7.7423, + "loss/crossentropy": 2.1594278812408447, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.24009329080581665, + "step": 3104 + }, + { + "epoch": 0.194125, + "grad_norm": 2.578125, + "grad_norm_var": 0.059137980143229164, + "learning_rate": 0.0001, + "loss": 7.7889, + "loss/crossentropy": 2.459377884864807, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.251203328371048, + "step": 3106 + }, + { + "epoch": 0.19425, + "grad_norm": 2.921875, + "grad_norm_var": 0.05748291015625, + "learning_rate": 0.0001, + "loss": 7.5124, + "loss/crossentropy": 2.0707362294197083, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.21844393759965897, + "step": 3108 + }, + { + "epoch": 0.194375, + "grad_norm": 2.453125, + "grad_norm_var": 0.101025390625, + "learning_rate": 0.0001, + "loss": 7.6732, + "loss/crossentropy": 2.2419523000717163, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2376825362443924, + "step": 3110 + }, + { + "epoch": 0.1945, + "grad_norm": 2.46875, + "grad_norm_var": 0.10181376139322916, + "learning_rate": 0.0001, + "loss": 7.6479, + "loss/crossentropy": 2.3259233236312866, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.25602586567401886, + "step": 3112 + }, + { + "epoch": 0.194625, + "grad_norm": 2.796875, + "grad_norm_var": 0.0926910400390625, + "learning_rate": 0.0001, + "loss": 7.5447, + "loss/crossentropy": 1.9941769242286682, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21642977744340897, + "step": 3114 + }, + { + "epoch": 0.19475, + "grad_norm": 2.578125, + "grad_norm_var": 0.09576822916666666, + "learning_rate": 0.0001, + "loss": 7.8191, + "loss/crossentropy": 2.5602025985717773, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.25449611991643906, + "step": 3116 + }, + { + "epoch": 0.194875, + "grad_norm": 2.484375, + "grad_norm_var": 0.09041239420572916, + "learning_rate": 0.0001, + "loss": 7.8136, + "loss/crossentropy": 2.3339617252349854, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.23222877830266953, + "step": 3118 + }, + { + "epoch": 0.195, + "grad_norm": 2.453125, + "grad_norm_var": 0.09040425618489584, + "learning_rate": 0.0001, + "loss": 7.7202, + "loss/crossentropy": 2.4900479316711426, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24146823585033417, + "step": 3120 + }, + { + "epoch": 0.195125, + "grad_norm": 2.734375, + "grad_norm_var": 0.09036356608072917, + "learning_rate": 0.0001, + "loss": 7.6583, + "loss/crossentropy": 2.191547393798828, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2142081782221794, + "step": 3122 + }, + { + "epoch": 0.19525, + "grad_norm": 2.375, + "grad_norm_var": 0.090185546875, + "learning_rate": 0.0001, + "loss": 7.6174, + "loss/crossentropy": 2.3771393299102783, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2471369206905365, + "step": 3124 + }, + { + "epoch": 0.195375, + "grad_norm": 2.625, + "grad_norm_var": 0.04331766764322917, + "learning_rate": 0.0001, + "loss": 7.5707, + "loss/crossentropy": 2.3110562562942505, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23804593086242676, + "step": 3126 + }, + { + "epoch": 0.1955, + "grad_norm": 2.5, + "grad_norm_var": 0.040379842122395836, + "learning_rate": 0.0001, + "loss": 7.6465, + "loss/crossentropy": 2.0701069831848145, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23578406125307083, + "step": 3128 + }, + { + "epoch": 0.195625, + "grad_norm": 2.421875, + "grad_norm_var": 0.038134765625, + "learning_rate": 0.0001, + "loss": 7.6923, + "loss/crossentropy": 2.2268728017807007, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22668010741472244, + "step": 3130 + }, + { + "epoch": 0.19575, + "grad_norm": 2.546875, + "grad_norm_var": 0.018724568684895835, + "learning_rate": 0.0001, + "loss": 7.7925, + "loss/crossentropy": 2.288881540298462, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2400398850440979, + "step": 3132 + }, + { + "epoch": 0.195875, + "grad_norm": 2.546875, + "grad_norm_var": 0.019310506184895833, + "learning_rate": 0.0001, + "loss": 7.5776, + "loss/crossentropy": 2.3762770891189575, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24239980429410934, + "step": 3134 + }, + { + "epoch": 0.196, + "grad_norm": 2.359375, + "grad_norm_var": 0.022037760416666666, + "learning_rate": 0.0001, + "loss": 7.5924, + "loss/crossentropy": 2.0780075788497925, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166167050600052, + "step": 3136 + }, + { + "epoch": 0.196125, + "grad_norm": 2.359375, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 7.6296, + "loss/crossentropy": 2.2321070432662964, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.257367268204689, + "step": 3138 + }, + { + "epoch": 0.19625, + "grad_norm": 2.5, + "grad_norm_var": 0.0124664306640625, + "learning_rate": 0.0001, + "loss": 7.655, + "loss/crossentropy": 2.2101333141326904, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22183632850646973, + "step": 3140 + }, + { + "epoch": 0.196375, + "grad_norm": 2.5, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 7.7539, + "loss/crossentropy": 2.3512450456619263, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2585148215293884, + "step": 3142 + }, + { + "epoch": 0.1965, + "grad_norm": 2.578125, + "grad_norm_var": 0.012360636393229167, + "learning_rate": 0.0001, + "loss": 7.6486, + "loss/crossentropy": 2.224077582359314, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.23895263671875, + "step": 3144 + }, + { + "epoch": 0.196625, + "grad_norm": 2.40625, + "grad_norm_var": 0.01246337890625, + "learning_rate": 0.0001, + "loss": 7.5564, + "loss/crossentropy": 2.1337246894836426, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2377912774682045, + "step": 3146 + }, + { + "epoch": 0.19675, + "grad_norm": 2.46875, + "grad_norm_var": 0.017671712239583335, + "learning_rate": 0.0001, + "loss": 7.6371, + "loss/crossentropy": 2.1832423210144043, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2349269688129425, + "step": 3148 + }, + { + "epoch": 0.196875, + "grad_norm": 2.515625, + "grad_norm_var": 0.0158203125, + "learning_rate": 0.0001, + "loss": 7.6147, + "loss/crossentropy": 2.2364492416381836, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24817125499248505, + "step": 3150 + }, + { + "epoch": 0.197, + "grad_norm": 2.21875, + "grad_norm_var": 0.019254557291666665, + "learning_rate": 0.0001, + "loss": 7.5449, + "loss/crossentropy": 2.30988085269928, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.26015302538871765, + "step": 3152 + }, + { + "epoch": 0.197125, + "grad_norm": 2.671875, + "grad_norm_var": 0.020392862955729167, + "learning_rate": 0.0001, + "loss": 7.7401, + "loss/crossentropy": 2.053266227245331, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24950231611728668, + "step": 3154 + }, + { + "epoch": 0.19725, + "grad_norm": 2.640625, + "grad_norm_var": 0.021126302083333333, + "learning_rate": 0.0001, + "loss": 7.6193, + "loss/crossentropy": 2.2058684825897217, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23218996822834015, + "step": 3156 + }, + { + "epoch": 0.197375, + "grad_norm": 2.671875, + "grad_norm_var": 0.031148274739583332, + "learning_rate": 0.0001, + "loss": 7.83, + "loss/crossentropy": 2.3524067401885986, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24826574325561523, + "step": 3158 + }, + { + "epoch": 0.1975, + "grad_norm": 2.609375, + "grad_norm_var": 0.032079060872395836, + "learning_rate": 0.0001, + "loss": 7.6028, + "loss/crossentropy": 2.124649167060852, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23834124207496643, + "step": 3160 + }, + { + "epoch": 0.197625, + "grad_norm": 3.203125, + "grad_norm_var": 0.05683186848958333, + "learning_rate": 0.0001, + "loss": 7.6409, + "loss/crossentropy": 2.170323371887207, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24094149470329285, + "step": 3162 + }, + { + "epoch": 0.19775, + "grad_norm": 2.65625, + "grad_norm_var": 0.06315816243489583, + "learning_rate": 0.0001, + "loss": 7.7351, + "loss/crossentropy": 2.340814709663391, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2378515675663948, + "step": 3164 + }, + { + "epoch": 0.197875, + "grad_norm": 2.640625, + "grad_norm_var": 0.0582672119140625, + "learning_rate": 0.0001, + "loss": 7.7131, + "loss/crossentropy": 2.454757571220398, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.25292622298002243, + "step": 3166 + }, + { + "epoch": 0.198, + "grad_norm": 2.59375, + "grad_norm_var": 0.0470703125, + "learning_rate": 0.0001, + "loss": 7.7054, + "loss/crossentropy": 2.3518433570861816, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2326052561402321, + "step": 3168 + }, + { + "epoch": 0.198125, + "grad_norm": 2.375, + "grad_norm_var": 0.049103800455729166, + "learning_rate": 0.0001, + "loss": 7.8036, + "loss/crossentropy": 2.3869314193725586, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2253723442554474, + "step": 3170 + }, + { + "epoch": 0.19825, + "grad_norm": 2.453125, + "grad_norm_var": 0.05646158854166667, + "learning_rate": 0.0001, + "loss": 7.549, + "loss/crossentropy": 2.1406772136688232, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2136363908648491, + "step": 3172 + }, + { + "epoch": 0.198375, + "grad_norm": 2.65625, + "grad_norm_var": 0.050959269205729164, + "learning_rate": 0.0001, + "loss": 7.7547, + "loss/crossentropy": 2.276672065258026, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2241700440645218, + "step": 3174 + }, + { + "epoch": 0.1985, + "grad_norm": 2.234375, + "grad_norm_var": 0.056696573893229164, + "learning_rate": 0.0001, + "loss": 7.5706, + "loss/crossentropy": 2.0704278349876404, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.22979146987199783, + "step": 3176 + }, + { + "epoch": 0.198625, + "grad_norm": 2.625, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 7.6319, + "loss/crossentropy": 2.155359983444214, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.23411893844604492, + "step": 3178 + }, + { + "epoch": 0.19875, + "grad_norm": 2.890625, + "grad_norm_var": 0.026708984375, + "learning_rate": 0.0001, + "loss": 7.6763, + "loss/crossentropy": 2.17472767829895, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2461443468928337, + "step": 3180 + }, + { + "epoch": 0.198875, + "grad_norm": 2.515625, + "grad_norm_var": 0.02847900390625, + "learning_rate": 0.0001, + "loss": 7.61, + "loss/crossentropy": 2.401307702064514, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2419404238462448, + "step": 3182 + }, + { + "epoch": 0.199, + "grad_norm": 2.359375, + "grad_norm_var": 0.03395894368489583, + "learning_rate": 0.0001, + "loss": 7.5815, + "loss/crossentropy": 2.303532361984253, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23136408627033234, + "step": 3184 + }, + { + "epoch": 0.199125, + "grad_norm": 2.8125, + "grad_norm_var": 0.03798726399739583, + "learning_rate": 0.0001, + "loss": 7.7585, + "loss/crossentropy": 2.691552758216858, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24934251606464386, + "step": 3186 + }, + { + "epoch": 0.19925, + "grad_norm": 2.234375, + "grad_norm_var": 0.041975911458333334, + "learning_rate": 0.0001, + "loss": 7.6867, + "loss/crossentropy": 2.1433998346328735, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23058529198169708, + "step": 3188 + }, + { + "epoch": 0.199375, + "grad_norm": 2.484375, + "grad_norm_var": 0.04368387858072917, + "learning_rate": 0.0001, + "loss": 7.6658, + "loss/crossentropy": 2.075712561607361, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23563528060913086, + "step": 3190 + }, + { + "epoch": 0.1995, + "grad_norm": 2.515625, + "grad_norm_var": 0.038849894205729166, + "learning_rate": 0.0001, + "loss": 7.6327, + "loss/crossentropy": 2.239920735359192, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24973846971988678, + "step": 3192 + }, + { + "epoch": 0.199625, + "grad_norm": 2.5625, + "grad_norm_var": 0.03837788899739583, + "learning_rate": 0.0001, + "loss": 7.6491, + "loss/crossentropy": 2.2711654901504517, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2149556428194046, + "step": 3194 + }, + { + "epoch": 0.19975, + "grad_norm": 2.703125, + "grad_norm_var": 0.033543904622395836, + "learning_rate": 0.0001, + "loss": 7.6259, + "loss/crossentropy": 2.2742727994918823, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24378067255020142, + "step": 3196 + }, + { + "epoch": 0.199875, + "grad_norm": 2.296875, + "grad_norm_var": 0.03476155598958333, + "learning_rate": 0.0001, + "loss": 7.6477, + "loss/crossentropy": 2.074104130268097, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22755059599876404, + "step": 3198 + }, + { + "epoch": 0.2, + "grad_norm": 2.71875, + "grad_norm_var": 0.03190816243489583, + "learning_rate": 0.0001, + "loss": 7.4978, + "loss/crossentropy": 2.2103521823883057, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22538809478282928, + "step": 3200 + }, + { + "epoch": 0.200125, + "grad_norm": 2.28125, + "grad_norm_var": 0.027132161458333335, + "learning_rate": 0.0001, + "loss": 7.4522, + "loss/crossentropy": 2.262304186820984, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.22689391672611237, + "step": 3202 + }, + { + "epoch": 0.20025, + "grad_norm": 2.484375, + "grad_norm_var": 0.024527994791666667, + "learning_rate": 0.0001, + "loss": 7.6634, + "loss/crossentropy": 2.159933626651764, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2550048828125, + "step": 3204 + }, + { + "epoch": 0.200375, + "grad_norm": 2.5, + "grad_norm_var": 0.022847493489583332, + "learning_rate": 0.0001, + "loss": 7.7359, + "loss/crossentropy": 2.2928545475006104, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2492647022008896, + "step": 3206 + }, + { + "epoch": 0.2005, + "grad_norm": 2.453125, + "grad_norm_var": 0.022435506184895832, + "learning_rate": 0.0001, + "loss": 7.6104, + "loss/crossentropy": 2.151831030845642, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21393808722496033, + "step": 3208 + }, + { + "epoch": 0.200625, + "grad_norm": 2.46875, + "grad_norm_var": 0.022005208333333335, + "learning_rate": 0.0001, + "loss": 7.8164, + "loss/crossentropy": 2.166096329689026, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23471974581480026, + "step": 3210 + }, + { + "epoch": 0.20075, + "grad_norm": 2.734375, + "grad_norm_var": 0.018277994791666665, + "learning_rate": 0.0001, + "loss": 7.565, + "loss/crossentropy": 2.181807518005371, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23415996134281158, + "step": 3212 + }, + { + "epoch": 0.200875, + "grad_norm": 2.453125, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 7.5114, + "loss/crossentropy": 2.1912107467651367, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22295787930488586, + "step": 3214 + }, + { + "epoch": 0.201, + "grad_norm": 2.453125, + "grad_norm_var": 0.01558837890625, + "learning_rate": 0.0001, + "loss": 7.4921, + "loss/crossentropy": 2.297171950340271, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.25197841227054596, + "step": 3216 + }, + { + "epoch": 0.201125, + "grad_norm": 2.359375, + "grad_norm_var": 0.015364583333333333, + "learning_rate": 0.0001, + "loss": 7.7764, + "loss/crossentropy": 2.484106659889221, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2731506675481796, + "step": 3218 + }, + { + "epoch": 0.20125, + "grad_norm": 2.359375, + "grad_norm_var": 0.017975870768229166, + "learning_rate": 0.0001, + "loss": 7.6025, + "loss/crossentropy": 2.156631350517273, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21255087107419968, + "step": 3220 + }, + { + "epoch": 0.201375, + "grad_norm": 2.421875, + "grad_norm_var": 0.0179351806640625, + "learning_rate": 0.0001, + "loss": 7.5934, + "loss/crossentropy": 2.3413909673690796, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22059939801692963, + "step": 3222 + }, + { + "epoch": 0.2015, + "grad_norm": 3.203125, + "grad_norm_var": 0.05054423014322917, + "learning_rate": 0.0001, + "loss": 7.6958, + "loss/crossentropy": 2.313757300376892, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23688851296901703, + "step": 3224 + }, + { + "epoch": 0.201625, + "grad_norm": 2.640625, + "grad_norm_var": 0.052643839518229166, + "learning_rate": 0.0001, + "loss": 7.5624, + "loss/crossentropy": 2.3218902349472046, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.23602160066366196, + "step": 3226 + }, + { + "epoch": 0.20175, + "grad_norm": 2.421875, + "grad_norm_var": 0.0511138916015625, + "learning_rate": 0.0001, + "loss": 7.6753, + "loss/crossentropy": 2.6514742374420166, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.2617443650960922, + "step": 3228 + }, + { + "epoch": 0.201875, + "grad_norm": 2.328125, + "grad_norm_var": 0.05115559895833333, + "learning_rate": 0.0001, + "loss": 7.7007, + "loss/crossentropy": 2.27648389339447, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23478404432535172, + "step": 3230 + }, + { + "epoch": 0.202, + "grad_norm": 2.75, + "grad_norm_var": 0.05434468587239583, + "learning_rate": 0.0001, + "loss": 7.7911, + "loss/crossentropy": 2.3282746076583862, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24819976091384888, + "step": 3232 + }, + { + "epoch": 0.202125, + "grad_norm": 2.359375, + "grad_norm_var": 0.05576883951822917, + "learning_rate": 0.0001, + "loss": 7.5815, + "loss/crossentropy": 2.0597460865974426, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23130206763744354, + "step": 3234 + }, + { + "epoch": 0.20225, + "grad_norm": 2.484375, + "grad_norm_var": 0.04765625, + "learning_rate": 0.0001, + "loss": 7.777, + "loss/crossentropy": 2.3545076847076416, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22461315244436264, + "step": 3236 + }, + { + "epoch": 0.202375, + "grad_norm": 2.734375, + "grad_norm_var": 0.045506795247395836, + "learning_rate": 0.0001, + "loss": 7.6893, + "loss/crossentropy": 2.2642041444778442, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24759702384471893, + "step": 3238 + }, + { + "epoch": 0.2025, + "grad_norm": 2.359375, + "grad_norm_var": 0.020438639322916667, + "learning_rate": 0.0001, + "loss": 7.5394, + "loss/crossentropy": 2.405397891998291, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24760686606168747, + "step": 3240 + }, + { + "epoch": 0.202625, + "grad_norm": 2.4375, + "grad_norm_var": 0.018050130208333334, + "learning_rate": 0.0001, + "loss": 7.5978, + "loss/crossentropy": 1.8945466876029968, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.20826154947280884, + "step": 3242 + }, + { + "epoch": 0.20275, + "grad_norm": 2.59375, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 7.6012, + "loss/crossentropy": 2.2214646339416504, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.21611948311328888, + "step": 3244 + }, + { + "epoch": 0.202875, + "grad_norm": 2.40625, + "grad_norm_var": 0.01685791015625, + "learning_rate": 0.0001, + "loss": 7.5799, + "loss/crossentropy": 2.41989004611969, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2531380206346512, + "step": 3246 + }, + { + "epoch": 0.203, + "grad_norm": 2.75, + "grad_norm_var": 0.016499837239583332, + "learning_rate": 0.0001, + "loss": 7.6609, + "loss/crossentropy": 2.1457839012145996, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22547265142202377, + "step": 3248 + }, + { + "epoch": 0.203125, + "grad_norm": 2.515625, + "grad_norm_var": 0.011815388997395834, + "learning_rate": 0.0001, + "loss": 7.6925, + "loss/crossentropy": 2.202640414237976, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24964796006679535, + "step": 3250 + }, + { + "epoch": 0.20325, + "grad_norm": 2.59375, + "grad_norm_var": 0.01177978515625, + "learning_rate": 0.0001, + "loss": 7.8597, + "loss/crossentropy": 2.2329805493354797, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.21109139919281006, + "step": 3252 + }, + { + "epoch": 0.203375, + "grad_norm": 2.546875, + "grad_norm_var": 0.013602701822916667, + "learning_rate": 0.0001, + "loss": 7.6793, + "loss/crossentropy": 2.37657368183136, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.22429364919662476, + "step": 3254 + }, + { + "epoch": 0.2035, + "grad_norm": 2.515625, + "grad_norm_var": 0.013654581705729167, + "learning_rate": 0.0001, + "loss": 7.5988, + "loss/crossentropy": 2.240600347518921, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24599803984165192, + "step": 3256 + }, + { + "epoch": 0.203625, + "grad_norm": 2.296875, + "grad_norm_var": 0.020670572916666668, + "learning_rate": 0.0001, + "loss": 7.6654, + "loss/crossentropy": 2.5686757564544678, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.263079509139061, + "step": 3258 + }, + { + "epoch": 0.20375, + "grad_norm": 2.53125, + "grad_norm_var": 0.023265584309895834, + "learning_rate": 0.0001, + "loss": 7.824, + "loss/crossentropy": 2.150991916656494, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2586375027894974, + "step": 3260 + }, + { + "epoch": 0.203875, + "grad_norm": 2.453125, + "grad_norm_var": 0.026048787434895835, + "learning_rate": 0.0001, + "loss": 7.6643, + "loss/crossentropy": 2.370198965072632, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23691494762897491, + "step": 3262 + }, + { + "epoch": 0.204, + "grad_norm": 2.28125, + "grad_norm_var": 0.029390462239583335, + "learning_rate": 0.0001, + "loss": 7.5166, + "loss/crossentropy": 2.2037036418914795, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23511488735675812, + "step": 3264 + }, + { + "epoch": 0.204125, + "grad_norm": 2.625, + "grad_norm_var": 0.032548014322916666, + "learning_rate": 0.0001, + "loss": 7.631, + "loss/crossentropy": 2.2161275148391724, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.22294826805591583, + "step": 3266 + }, + { + "epoch": 0.20425, + "grad_norm": 3.015625, + "grad_norm_var": 0.06555887858072916, + "learning_rate": 0.0001, + "loss": 7.8449, + "loss/crossentropy": 2.403158664703369, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.24931098520755768, + "step": 3268 + }, + { + "epoch": 0.204375, + "grad_norm": 2.4375, + "grad_norm_var": 0.06297098795572917, + "learning_rate": 0.0001, + "loss": 7.8327, + "loss/crossentropy": 2.4231287240982056, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24163633584976196, + "step": 3270 + }, + { + "epoch": 0.2045, + "grad_norm": 2.53125, + "grad_norm_var": 0.06093648274739583, + "learning_rate": 0.0001, + "loss": 7.4887, + "loss/crossentropy": 2.196571111679077, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2219029664993286, + "step": 3272 + }, + { + "epoch": 0.204625, + "grad_norm": 2.5, + "grad_norm_var": 0.05232645670572917, + "learning_rate": 0.0001, + "loss": 7.6319, + "loss/crossentropy": 2.336767315864563, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23825974017381668, + "step": 3274 + }, + { + "epoch": 0.20475, + "grad_norm": 2.25, + "grad_norm_var": 0.05592041015625, + "learning_rate": 0.0001, + "loss": 7.4521, + "loss/crossentropy": 1.9283623099327087, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2104254812002182, + "step": 3276 + }, + { + "epoch": 0.204875, + "grad_norm": 2.75, + "grad_norm_var": 0.05458882649739583, + "learning_rate": 0.0001, + "loss": 7.7259, + "loss/crossentropy": 2.250017523765564, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23542233556509018, + "step": 3278 + }, + { + "epoch": 0.205, + "grad_norm": 2.265625, + "grad_norm_var": 0.053807576497395836, + "learning_rate": 0.0001, + "loss": 7.5853, + "loss/crossentropy": 2.0355631709098816, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21646135300397873, + "step": 3280 + }, + { + "epoch": 0.205125, + "grad_norm": 2.53125, + "grad_norm_var": 0.056761678059895834, + "learning_rate": 0.0001, + "loss": 7.5638, + "loss/crossentropy": 2.1133495569229126, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2248711958527565, + "step": 3282 + }, + { + "epoch": 0.20525, + "grad_norm": 2.421875, + "grad_norm_var": 0.0180816650390625, + "learning_rate": 0.0001, + "loss": 7.6095, + "loss/crossentropy": 2.4140706062316895, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.22342108935117722, + "step": 3284 + }, + { + "epoch": 0.205375, + "grad_norm": 2.453125, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 7.6281, + "loss/crossentropy": 2.4801841974258423, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24090874940156937, + "step": 3286 + }, + { + "epoch": 0.2055, + "grad_norm": 2.671875, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 7.6194, + "loss/crossentropy": 2.333125591278076, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23281607031822205, + "step": 3288 + }, + { + "epoch": 0.205625, + "grad_norm": 2.328125, + "grad_norm_var": 0.022443644205729165, + "learning_rate": 0.0001, + "loss": 7.5728, + "loss/crossentropy": 2.057736098766327, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.21230417490005493, + "step": 3290 + }, + { + "epoch": 0.20575, + "grad_norm": 2.5, + "grad_norm_var": 0.019025675455729165, + "learning_rate": 0.0001, + "loss": 7.5554, + "loss/crossentropy": 2.2296417951583862, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24230563640594482, + "step": 3292 + }, + { + "epoch": 0.205875, + "grad_norm": 2.640625, + "grad_norm_var": 0.0163482666015625, + "learning_rate": 0.0001, + "loss": 7.5161, + "loss/crossentropy": 2.4877594709396362, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23580122739076614, + "step": 3294 + }, + { + "epoch": 0.206, + "grad_norm": 2.65625, + "grad_norm_var": 0.015816243489583333, + "learning_rate": 0.0001, + "loss": 7.6607, + "loss/crossentropy": 2.22783100605011, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23493072390556335, + "step": 3296 + }, + { + "epoch": 0.206125, + "grad_norm": 3.484375, + "grad_norm_var": 0.06994527180989583, + "learning_rate": 0.0001, + "loss": 7.548, + "loss/crossentropy": 2.177275240421295, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23492375016212463, + "step": 3298 + }, + { + "epoch": 0.20625, + "grad_norm": 2.1875, + "grad_norm_var": 0.07803446451822917, + "learning_rate": 0.0001, + "loss": 7.612, + "loss/crossentropy": 2.100243628025055, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.21778713911771774, + "step": 3300 + }, + { + "epoch": 0.206375, + "grad_norm": 2.40625, + "grad_norm_var": 0.08179931640625, + "learning_rate": 0.0001, + "loss": 7.6651, + "loss/crossentropy": 2.612051248550415, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.236809641122818, + "step": 3302 + }, + { + "epoch": 0.2065, + "grad_norm": 2.53125, + "grad_norm_var": 0.10703125, + "learning_rate": 0.0001, + "loss": 7.7176, + "loss/crossentropy": 2.3358840942382812, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22349786013364792, + "step": 3304 + }, + { + "epoch": 0.206625, + "grad_norm": 2.390625, + "grad_norm_var": 0.10436197916666666, + "learning_rate": 0.0001, + "loss": 7.6017, + "loss/crossentropy": 2.0664124488830566, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22234025597572327, + "step": 3306 + }, + { + "epoch": 0.20675, + "grad_norm": 2.765625, + "grad_norm_var": 0.105126953125, + "learning_rate": 0.0001, + "loss": 7.7791, + "loss/crossentropy": 2.3125079870224, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.22895997017621994, + "step": 3308 + }, + { + "epoch": 0.206875, + "grad_norm": 2.5625, + "grad_norm_var": 0.10261942545572916, + "learning_rate": 0.0001, + "loss": 7.761, + "loss/crossentropy": 2.0990543365478516, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.22909457981586456, + "step": 3310 + }, + { + "epoch": 0.207, + "grad_norm": 2.359375, + "grad_norm_var": 0.11274312337239584, + "learning_rate": 0.0001, + "loss": 7.6989, + "loss/crossentropy": 2.3832221031188965, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23881246894598007, + "step": 3312 + }, + { + "epoch": 0.207125, + "grad_norm": 2.5625, + "grad_norm_var": 0.053099568684895834, + "learning_rate": 0.0001, + "loss": 7.5829, + "loss/crossentropy": 2.070538818836212, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.22222436219453812, + "step": 3314 + }, + { + "epoch": 0.20725, + "grad_norm": 2.359375, + "grad_norm_var": 0.0491607666015625, + "learning_rate": 0.0001, + "loss": 7.7099, + "loss/crossentropy": 2.0685949325561523, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.225556842982769, + "step": 3316 + }, + { + "epoch": 0.207375, + "grad_norm": 2.453125, + "grad_norm_var": 0.0464752197265625, + "learning_rate": 0.0001, + "loss": 7.5792, + "loss/crossentropy": 2.2039034366607666, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23356658220291138, + "step": 3318 + }, + { + "epoch": 0.2075, + "grad_norm": 2.359375, + "grad_norm_var": 0.017772420247395834, + "learning_rate": 0.0001, + "loss": 7.6484, + "loss/crossentropy": 2.520377278327942, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23397985100746155, + "step": 3320 + }, + { + "epoch": 0.207625, + "grad_norm": 2.421875, + "grad_norm_var": 0.018724568684895835, + "learning_rate": 0.0001, + "loss": 7.6201, + "loss/crossentropy": 2.232245087623596, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2378198802471161, + "step": 3322 + }, + { + "epoch": 0.20775, + "grad_norm": 2.703125, + "grad_norm_var": 0.017574055989583334, + "learning_rate": 0.0001, + "loss": 7.7734, + "loss/crossentropy": 2.3666106462478638, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22776059806346893, + "step": 3324 + }, + { + "epoch": 0.207875, + "grad_norm": 2.453125, + "grad_norm_var": 0.026090494791666665, + "learning_rate": 0.0001, + "loss": 7.692, + "loss/crossentropy": 2.4405359029769897, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23942655324935913, + "step": 3326 + }, + { + "epoch": 0.208, + "grad_norm": 2.3125, + "grad_norm_var": 0.026097615559895832, + "learning_rate": 0.0001, + "loss": 7.4516, + "loss/crossentropy": 2.165894627571106, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.21770837903022766, + "step": 3328 + }, + { + "epoch": 0.208125, + "grad_norm": 2.3125, + "grad_norm_var": 0.026106770833333334, + "learning_rate": 0.0001, + "loss": 7.5851, + "loss/crossentropy": 2.2975982427597046, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24745838344097137, + "step": 3330 + }, + { + "epoch": 0.20825, + "grad_norm": 2.28125, + "grad_norm_var": 0.0307769775390625, + "learning_rate": 0.0001, + "loss": 7.5979, + "loss/crossentropy": 2.1849515438079834, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23390965163707733, + "step": 3332 + }, + { + "epoch": 0.208375, + "grad_norm": 2.484375, + "grad_norm_var": 0.028743489583333334, + "learning_rate": 0.0001, + "loss": 7.7235, + "loss/crossentropy": 2.336664915084839, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22321896255016327, + "step": 3334 + }, + { + "epoch": 0.2085, + "grad_norm": 2.625, + "grad_norm_var": 0.029752604166666665, + "learning_rate": 0.0001, + "loss": 7.6849, + "loss/crossentropy": 2.4030131101608276, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24722521752119064, + "step": 3336 + }, + { + "epoch": 0.208625, + "grad_norm": 2.40625, + "grad_norm_var": 0.028348795572916665, + "learning_rate": 0.0001, + "loss": 7.6468, + "loss/crossentropy": 2.451479196548462, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.2548409700393677, + "step": 3338 + }, + { + "epoch": 0.20875, + "grad_norm": 2.53125, + "grad_norm_var": 0.024079386393229166, + "learning_rate": 0.0001, + "loss": 7.7279, + "loss/crossentropy": 2.391486406326294, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24848993867635727, + "step": 3340 + }, + { + "epoch": 0.208875, + "grad_norm": 2.5625, + "grad_norm_var": 0.015412394205729167, + "learning_rate": 0.0001, + "loss": 7.6761, + "loss/crossentropy": 2.242367148399353, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23639392107725143, + "step": 3342 + }, + { + "epoch": 0.209, + "grad_norm": 2.296875, + "grad_norm_var": 0.016487630208333333, + "learning_rate": 0.0001, + "loss": 7.6526, + "loss/crossentropy": 2.131048798561096, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2150106355547905, + "step": 3344 + }, + { + "epoch": 0.209125, + "grad_norm": 2.359375, + "grad_norm_var": 0.0172515869140625, + "learning_rate": 0.0001, + "loss": 7.4834, + "loss/crossentropy": 2.2289204597473145, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2406204640865326, + "step": 3346 + }, + { + "epoch": 0.20925, + "grad_norm": 3.078125, + "grad_norm_var": 0.04006245930989583, + "learning_rate": 0.0001, + "loss": 7.6395, + "loss/crossentropy": 2.335645318031311, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360967919230461, + "step": 3348 + }, + { + "epoch": 0.209375, + "grad_norm": 2.3125, + "grad_norm_var": 0.0597076416015625, + "learning_rate": 0.0001, + "loss": 7.6344, + "loss/crossentropy": 2.3322980403900146, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2533845752477646, + "step": 3350 + }, + { + "epoch": 0.2095, + "grad_norm": 2.328125, + "grad_norm_var": 0.06763916015625, + "learning_rate": 0.0001, + "loss": 7.394, + "loss/crossentropy": 2.0071592926979065, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21447932720184326, + "step": 3352 + }, + { + "epoch": 0.209625, + "grad_norm": 2.671875, + "grad_norm_var": 0.07714436848958334, + "learning_rate": 0.0001, + "loss": 7.5678, + "loss/crossentropy": 2.3756041526794434, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23200811445713043, + "step": 3354 + }, + { + "epoch": 0.20975, + "grad_norm": 2.34375, + "grad_norm_var": 0.08035380045572917, + "learning_rate": 0.0001, + "loss": 7.4439, + "loss/crossentropy": 1.9585599303245544, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.19989113509655, + "step": 3356 + }, + { + "epoch": 0.209875, + "grad_norm": 2.640625, + "grad_norm_var": 0.08413798014322917, + "learning_rate": 0.0001, + "loss": 7.3117, + "loss/crossentropy": 2.1502009630203247, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22656698524951935, + "step": 3358 + }, + { + "epoch": 0.21, + "grad_norm": 2.375, + "grad_norm_var": 0.077978515625, + "learning_rate": 0.0001, + "loss": 7.6045, + "loss/crossentropy": 2.3314971923828125, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2628200501203537, + "step": 3360 + }, + { + "epoch": 0.210125, + "grad_norm": 2.375, + "grad_norm_var": 0.07669169108072917, + "learning_rate": 0.0001, + "loss": 7.6801, + "loss/crossentropy": 2.407312273979187, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2756526470184326, + "step": 3362 + }, + { + "epoch": 0.21025, + "grad_norm": 2.578125, + "grad_norm_var": 0.06636962890625, + "learning_rate": 0.0001, + "loss": 7.7946, + "loss/crossentropy": 2.308638334274292, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22128069400787354, + "step": 3364 + }, + { + "epoch": 0.210375, + "grad_norm": 2.625, + "grad_norm_var": 0.048192342122395836, + "learning_rate": 0.0001, + "loss": 7.4601, + "loss/crossentropy": 2.227054715156555, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.223979651927948, + "step": 3366 + }, + { + "epoch": 0.2105, + "grad_norm": 2.25, + "grad_norm_var": 0.04175516764322917, + "learning_rate": 0.0001, + "loss": 7.3921, + "loss/crossentropy": 2.1322200298309326, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.25718845427036285, + "step": 3368 + }, + { + "epoch": 0.210625, + "grad_norm": 2.640625, + "grad_norm_var": 0.03443603515625, + "learning_rate": 0.0001, + "loss": 7.5104, + "loss/crossentropy": 2.1420618891716003, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2320111319422722, + "step": 3370 + }, + { + "epoch": 0.21075, + "grad_norm": 2.3125, + "grad_norm_var": 0.03416239420572917, + "learning_rate": 0.0001, + "loss": 7.7232, + "loss/crossentropy": 2.3579647541046143, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2402171939611435, + "step": 3372 + }, + { + "epoch": 0.210875, + "grad_norm": 3.34375, + "grad_norm_var": 0.07431233723958333, + "learning_rate": 0.0001, + "loss": 7.4416, + "loss/crossentropy": 2.3642072677612305, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22273917496204376, + "step": 3374 + }, + { + "epoch": 0.211, + "grad_norm": 2.328125, + "grad_norm_var": 0.07669270833333333, + "learning_rate": 0.0001, + "loss": 7.4455, + "loss/crossentropy": 2.2815465927124023, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23410624265670776, + "step": 3376 + }, + { + "epoch": 0.211125, + "grad_norm": 2.546875, + "grad_norm_var": 0.07057291666666667, + "learning_rate": 0.0001, + "loss": 7.5148, + "loss/crossentropy": 2.066421687602997, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.24254445731639862, + "step": 3378 + }, + { + "epoch": 0.21125, + "grad_norm": 2.453125, + "grad_norm_var": 0.06165364583333333, + "learning_rate": 0.0001, + "loss": 7.5341, + "loss/crossentropy": 2.347060441970825, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.25217752158641815, + "step": 3380 + }, + { + "epoch": 0.211375, + "grad_norm": 2.640625, + "grad_norm_var": 0.06360270182291666, + "learning_rate": 0.0001, + "loss": 7.7507, + "loss/crossentropy": 2.1539812088012695, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.26475973427295685, + "step": 3382 + }, + { + "epoch": 0.2115, + "grad_norm": 2.609375, + "grad_norm_var": 0.0606353759765625, + "learning_rate": 0.0001, + "loss": 7.7259, + "loss/crossentropy": 2.1600695848464966, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2167871668934822, + "step": 3384 + }, + { + "epoch": 0.211625, + "grad_norm": 2.828125, + "grad_norm_var": 0.06337788899739584, + "learning_rate": 0.0001, + "loss": 7.6085, + "loss/crossentropy": 2.286532163619995, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2332899570465088, + "step": 3386 + }, + { + "epoch": 0.21175, + "grad_norm": 2.515625, + "grad_norm_var": 0.05858968098958333, + "learning_rate": 0.0001, + "loss": 7.562, + "loss/crossentropy": 2.1641604900360107, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2164478898048401, + "step": 3388 + }, + { + "epoch": 0.211875, + "grad_norm": 2.84375, + "grad_norm_var": 0.0242095947265625, + "learning_rate": 0.0001, + "loss": 7.7682, + "loss/crossentropy": 2.3409924507141113, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23107275366783142, + "step": 3390 + }, + { + "epoch": 0.212, + "grad_norm": 2.65625, + "grad_norm_var": 0.019364420572916666, + "learning_rate": 0.0001, + "loss": 7.5186, + "loss/crossentropy": 2.2500524520874023, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.213922381401062, + "step": 3392 + }, + { + "epoch": 0.212125, + "grad_norm": 2.328125, + "grad_norm_var": 0.038426717122395836, + "learning_rate": 0.0001, + "loss": 7.5321, + "loss/crossentropy": 2.1275582909584045, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22479213774204254, + "step": 3394 + }, + { + "epoch": 0.21225, + "grad_norm": 2.578125, + "grad_norm_var": 0.0404449462890625, + "learning_rate": 0.0001, + "loss": 7.5174, + "loss/crossentropy": 2.212312698364258, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24751190841197968, + "step": 3396 + }, + { + "epoch": 0.212375, + "grad_norm": 2.328125, + "grad_norm_var": 0.042236328125, + "learning_rate": 0.0001, + "loss": 7.4043, + "loss/crossentropy": 1.9285815954208374, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.19548063725233078, + "step": 3398 + }, + { + "epoch": 0.2125, + "grad_norm": 2.5625, + "grad_norm_var": 0.0437408447265625, + "learning_rate": 0.0001, + "loss": 7.6525, + "loss/crossentropy": 2.2272496223449707, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.24632243812084198, + "step": 3400 + }, + { + "epoch": 0.212625, + "grad_norm": 2.46875, + "grad_norm_var": 0.03942057291666667, + "learning_rate": 0.0001, + "loss": 7.6239, + "loss/crossentropy": 2.521925210952759, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23339618742465973, + "step": 3402 + }, + { + "epoch": 0.21275, + "grad_norm": 2.296875, + "grad_norm_var": 0.04421284993489583, + "learning_rate": 0.0001, + "loss": 7.5628, + "loss/crossentropy": 2.089443802833557, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2286679819226265, + "step": 3404 + }, + { + "epoch": 0.212875, + "grad_norm": 2.59375, + "grad_norm_var": 0.03938700358072917, + "learning_rate": 0.0001, + "loss": 7.6109, + "loss/crossentropy": 2.3914257287979126, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23510746657848358, + "step": 3406 + }, + { + "epoch": 0.213, + "grad_norm": 2.171875, + "grad_norm_var": 0.038863118489583334, + "learning_rate": 0.0001, + "loss": 7.456, + "loss/crossentropy": 2.1810909509658813, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21339446306228638, + "step": 3408 + }, + { + "epoch": 0.213125, + "grad_norm": 2.640625, + "grad_norm_var": 0.031126912434895834, + "learning_rate": 0.0001, + "loss": 7.6262, + "loss/crossentropy": 2.068147301673889, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23478543758392334, + "step": 3410 + }, + { + "epoch": 0.21325, + "grad_norm": 2.4375, + "grad_norm_var": 0.022786458333333332, + "learning_rate": 0.0001, + "loss": 7.6825, + "loss/crossentropy": 2.299628734588623, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24276187270879745, + "step": 3412 + }, + { + "epoch": 0.213375, + "grad_norm": 2.53125, + "grad_norm_var": 0.0220703125, + "learning_rate": 0.0001, + "loss": 7.5285, + "loss/crossentropy": 2.539917826652527, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24582375586032867, + "step": 3414 + }, + { + "epoch": 0.2135, + "grad_norm": 2.578125, + "grad_norm_var": 0.016039021809895835, + "learning_rate": 0.0001, + "loss": 7.7053, + "loss/crossentropy": 2.2273647785186768, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.24760635942220688, + "step": 3416 + }, + { + "epoch": 0.213625, + "grad_norm": 2.421875, + "grad_norm_var": 0.023143513997395834, + "learning_rate": 0.0001, + "loss": 7.5504, + "loss/crossentropy": 2.0793908834457397, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20350514352321625, + "step": 3418 + }, + { + "epoch": 0.21375, + "grad_norm": 2.6875, + "grad_norm_var": 0.030887858072916666, + "learning_rate": 0.0001, + "loss": 7.6719, + "loss/crossentropy": 2.2238458395004272, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23141219466924667, + "step": 3420 + }, + { + "epoch": 0.213875, + "grad_norm": 2.78125, + "grad_norm_var": 0.03818359375, + "learning_rate": 0.0001, + "loss": 7.6894, + "loss/crossentropy": 2.2496371269226074, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.24826089292764664, + "step": 3422 + }, + { + "epoch": 0.214, + "grad_norm": 2.453125, + "grad_norm_var": 0.031083170572916666, + "learning_rate": 0.0001, + "loss": 7.5904, + "loss/crossentropy": 2.2201942205429077, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.22636444121599197, + "step": 3424 + }, + { + "epoch": 0.214125, + "grad_norm": 2.5625, + "grad_norm_var": 0.029117838541666666, + "learning_rate": 0.0001, + "loss": 7.7162, + "loss/crossentropy": 2.387493371963501, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23339012265205383, + "step": 3426 + }, + { + "epoch": 0.21425, + "grad_norm": 2.625, + "grad_norm_var": 0.038263956705729164, + "learning_rate": 0.0001, + "loss": 7.4986, + "loss/crossentropy": 2.1064809560775757, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2052219733595848, + "step": 3428 + }, + { + "epoch": 0.214375, + "grad_norm": 2.5, + "grad_norm_var": 0.043603515625, + "learning_rate": 0.0001, + "loss": 7.7774, + "loss/crossentropy": 2.430534839630127, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2444680631160736, + "step": 3430 + }, + { + "epoch": 0.2145, + "grad_norm": 2.4375, + "grad_norm_var": 0.0421539306640625, + "learning_rate": 0.0001, + "loss": 7.6541, + "loss/crossentropy": 2.2829513549804688, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23072397708892822, + "step": 3432 + }, + { + "epoch": 0.214625, + "grad_norm": 2.4375, + "grad_norm_var": 0.047215779622395836, + "learning_rate": 0.0001, + "loss": 7.4546, + "loss/crossentropy": 2.240882158279419, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24445254355669022, + "step": 3434 + }, + { + "epoch": 0.21475, + "grad_norm": 2.8125, + "grad_norm_var": 0.04260660807291667, + "learning_rate": 0.0001, + "loss": 7.585, + "loss/crossentropy": 1.915956974029541, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.20426518470048904, + "step": 3436 + }, + { + "epoch": 0.214875, + "grad_norm": 2.484375, + "grad_norm_var": 0.038141886393229164, + "learning_rate": 0.0001, + "loss": 7.6817, + "loss/crossentropy": 2.0629988312721252, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.23025204241275787, + "step": 3438 + }, + { + "epoch": 0.215, + "grad_norm": 2.390625, + "grad_norm_var": 0.0386627197265625, + "learning_rate": 0.0001, + "loss": 7.4915, + "loss/crossentropy": 2.1499475240707397, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2223310023546219, + "step": 3440 + }, + { + "epoch": 0.215125, + "grad_norm": 2.640625, + "grad_norm_var": 0.0455474853515625, + "learning_rate": 0.0001, + "loss": 7.6482, + "loss/crossentropy": 2.323388457298279, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.23185274004936218, + "step": 3442 + }, + { + "epoch": 0.21525, + "grad_norm": 2.359375, + "grad_norm_var": 0.0388580322265625, + "learning_rate": 0.0001, + "loss": 7.6131, + "loss/crossentropy": 2.4512449502944946, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2369954064488411, + "step": 3444 + }, + { + "epoch": 0.215375, + "grad_norm": 2.4375, + "grad_norm_var": 0.032373046875, + "learning_rate": 0.0001, + "loss": 7.637, + "loss/crossentropy": 2.382017970085144, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.23922354727983475, + "step": 3446 + }, + { + "epoch": 0.2155, + "grad_norm": 2.390625, + "grad_norm_var": 0.03400065104166667, + "learning_rate": 0.0001, + "loss": 7.5746, + "loss/crossentropy": 2.27813720703125, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2455897182226181, + "step": 3448 + }, + { + "epoch": 0.215625, + "grad_norm": 2.5, + "grad_norm_var": 0.024247233072916666, + "learning_rate": 0.0001, + "loss": 7.6352, + "loss/crossentropy": 2.0362807512283325, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.24546430259943008, + "step": 3450 + }, + { + "epoch": 0.21575, + "grad_norm": 2.390625, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 7.5596, + "loss/crossentropy": 2.3591285943984985, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2355157434940338, + "step": 3452 + }, + { + "epoch": 0.215875, + "grad_norm": 2.5625, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 7.4538, + "loss/crossentropy": 2.0467506051063538, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2161174640059471, + "step": 3454 + }, + { + "epoch": 0.216, + "grad_norm": 2.234375, + "grad_norm_var": 0.020247395833333334, + "learning_rate": 0.0001, + "loss": 7.744, + "loss/crossentropy": 2.383505702018738, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23139237612485886, + "step": 3456 + }, + { + "epoch": 0.216125, + "grad_norm": 2.46875, + "grad_norm_var": 0.007906087239583333, + "learning_rate": 0.0001, + "loss": 7.6497, + "loss/crossentropy": 2.422740340232849, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2240883857011795, + "step": 3458 + }, + { + "epoch": 0.21625, + "grad_norm": 2.4375, + "grad_norm_var": 0.0074615478515625, + "learning_rate": 0.0001, + "loss": 7.6628, + "loss/crossentropy": 2.494243025779724, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2455742135643959, + "step": 3460 + }, + { + "epoch": 0.216375, + "grad_norm": 2.546875, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 7.5369, + "loss/crossentropy": 2.1609995365142822, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2432066798210144, + "step": 3462 + }, + { + "epoch": 0.2165, + "grad_norm": 2.359375, + "grad_norm_var": 0.0080230712890625, + "learning_rate": 0.0001, + "loss": 7.6719, + "loss/crossentropy": 2.152750015258789, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22096887230873108, + "step": 3464 + }, + { + "epoch": 0.216625, + "grad_norm": 2.5625, + "grad_norm_var": 0.008275349934895834, + "learning_rate": 0.0001, + "loss": 7.6158, + "loss/crossentropy": 2.2551519870758057, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22652066498994827, + "step": 3466 + }, + { + "epoch": 0.21675, + "grad_norm": 2.390625, + "grad_norm_var": 0.0111236572265625, + "learning_rate": 0.0001, + "loss": 7.6574, + "loss/crossentropy": 2.2164549827575684, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23051265627145767, + "step": 3468 + }, + { + "epoch": 0.216875, + "grad_norm": 2.328125, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 7.5793, + "loss/crossentropy": 2.5120718479156494, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24404268711805344, + "step": 3470 + }, + { + "epoch": 0.217, + "grad_norm": 2.390625, + "grad_norm_var": 0.009358723958333334, + "learning_rate": 0.0001, + "loss": 7.4738, + "loss/crossentropy": 2.171375274658203, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23342353105545044, + "step": 3472 + }, + { + "epoch": 0.217125, + "grad_norm": 2.34375, + "grad_norm_var": 0.00963134765625, + "learning_rate": 0.0001, + "loss": 7.6139, + "loss/crossentropy": 2.095268964767456, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23750004172325134, + "step": 3474 + }, + { + "epoch": 0.21725, + "grad_norm": 2.40625, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 7.6385, + "loss/crossentropy": 2.187831997871399, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21564120054244995, + "step": 3476 + }, + { + "epoch": 0.217375, + "grad_norm": 2.375, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 7.649, + "loss/crossentropy": 2.339821934700012, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24484576284885406, + "step": 3478 + }, + { + "epoch": 0.2175, + "grad_norm": 2.46875, + "grad_norm_var": 0.010188802083333334, + "learning_rate": 0.0001, + "loss": 7.7182, + "loss/crossentropy": 2.085095524787903, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21802203357219696, + "step": 3480 + }, + { + "epoch": 0.217625, + "grad_norm": 2.453125, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 7.6739, + "loss/crossentropy": 2.4005582332611084, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22154852002859116, + "step": 3482 + }, + { + "epoch": 0.21775, + "grad_norm": 4.375, + "grad_norm_var": 0.24265034993489584, + "learning_rate": 0.0001, + "loss": 7.5902, + "loss/crossentropy": 2.235607147216797, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.3020609989762306, + "step": 3484 + }, + { + "epoch": 0.217875, + "grad_norm": 2.703125, + "grad_norm_var": 0.2362457275390625, + "learning_rate": 0.0001, + "loss": 7.5921, + "loss/crossentropy": 2.37876033782959, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.238236665725708, + "step": 3486 + }, + { + "epoch": 0.218, + "grad_norm": 2.828125, + "grad_norm_var": 0.23375244140625, + "learning_rate": 0.0001, + "loss": 7.7093, + "loss/crossentropy": 2.446916341781616, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2403736189007759, + "step": 3488 + }, + { + "epoch": 0.218125, + "grad_norm": 2.28125, + "grad_norm_var": 0.23952534993489583, + "learning_rate": 0.0001, + "loss": 7.6001, + "loss/crossentropy": 2.703190565109253, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.24328448623418808, + "step": 3490 + }, + { + "epoch": 0.21825, + "grad_norm": 2.515625, + "grad_norm_var": 0.23860677083333334, + "learning_rate": 0.0001, + "loss": 7.6135, + "loss/crossentropy": 2.266252636909485, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2304093837738037, + "step": 3492 + }, + { + "epoch": 0.218375, + "grad_norm": 3.0625, + "grad_norm_var": 0.24362691243489584, + "learning_rate": 0.0001, + "loss": 7.6869, + "loss/crossentropy": 2.2512608766555786, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23177290707826614, + "step": 3494 + }, + { + "epoch": 0.2185, + "grad_norm": 2.390625, + "grad_norm_var": 0.24628804524739584, + "learning_rate": 0.0001, + "loss": 7.8453, + "loss/crossentropy": 2.3944746255874634, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22361087799072266, + "step": 3496 + }, + { + "epoch": 0.218625, + "grad_norm": 2.34375, + "grad_norm_var": 0.25588785807291664, + "learning_rate": 0.0001, + "loss": 7.3925, + "loss/crossentropy": 1.9911410212516785, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2150706946849823, + "step": 3498 + }, + { + "epoch": 0.21875, + "grad_norm": 2.84375, + "grad_norm_var": 0.05894266764322917, + "learning_rate": 0.0001, + "loss": 7.6356, + "loss/crossentropy": 2.290215253829956, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22882460057735443, + "step": 3500 + }, + { + "epoch": 0.218875, + "grad_norm": 2.15625, + "grad_norm_var": 0.07018941243489583, + "learning_rate": 0.0001, + "loss": 7.5163, + "loss/crossentropy": 2.201116681098938, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.20892898738384247, + "step": 3502 + }, + { + "epoch": 0.219, + "grad_norm": 2.5, + "grad_norm_var": 0.06816304524739583, + "learning_rate": 0.0001, + "loss": 7.7937, + "loss/crossentropy": 2.210504412651062, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.21811866015195847, + "step": 3504 + }, + { + "epoch": 0.219125, + "grad_norm": 2.5625, + "grad_norm_var": 0.06256510416666666, + "learning_rate": 0.0001, + "loss": 7.6281, + "loss/crossentropy": 2.355741262435913, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23753728717565536, + "step": 3506 + }, + { + "epoch": 0.21925, + "grad_norm": 2.296875, + "grad_norm_var": 0.06705729166666667, + "learning_rate": 0.0001, + "loss": 7.5304, + "loss/crossentropy": 2.2776095867156982, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22738799452781677, + "step": 3508 + }, + { + "epoch": 0.219375, + "grad_norm": 2.40625, + "grad_norm_var": 0.046605428059895836, + "learning_rate": 0.0001, + "loss": 7.5032, + "loss/crossentropy": 2.295590400695801, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2449091300368309, + "step": 3510 + }, + { + "epoch": 0.2195, + "grad_norm": 2.671875, + "grad_norm_var": 0.03515625, + "learning_rate": 0.0001, + "loss": 7.6288, + "loss/crossentropy": 2.326041340827942, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2660403698682785, + "step": 3512 + }, + { + "epoch": 0.219625, + "grad_norm": 2.28125, + "grad_norm_var": 0.03609619140625, + "learning_rate": 0.0001, + "loss": 7.5858, + "loss/crossentropy": 2.432402729988098, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.25160669535398483, + "step": 3514 + }, + { + "epoch": 0.21975, + "grad_norm": 2.890625, + "grad_norm_var": 0.038895670572916666, + "learning_rate": 0.0001, + "loss": 7.818, + "loss/crossentropy": 2.349487543106079, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23811831325292587, + "step": 3516 + }, + { + "epoch": 0.219875, + "grad_norm": 2.546875, + "grad_norm_var": 0.0508941650390625, + "learning_rate": 0.0001, + "loss": 7.7315, + "loss/crossentropy": 2.3854116201400757, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24041782319545746, + "step": 3518 + }, + { + "epoch": 0.22, + "grad_norm": 2.265625, + "grad_norm_var": 0.048075358072916664, + "learning_rate": 0.0001, + "loss": 7.5721, + "loss/crossentropy": 2.253451347351074, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2183404043316841, + "step": 3520 + }, + { + "epoch": 0.220125, + "grad_norm": 2.453125, + "grad_norm_var": 0.04853108723958333, + "learning_rate": 0.0001, + "loss": 7.4311, + "loss/crossentropy": 1.989369809627533, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.218755841255188, + "step": 3522 + }, + { + "epoch": 0.22025, + "grad_norm": 2.390625, + "grad_norm_var": 0.046187337239583334, + "learning_rate": 0.0001, + "loss": 7.5011, + "loss/crossentropy": 2.3954477310180664, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.254411518573761, + "step": 3524 + }, + { + "epoch": 0.220375, + "grad_norm": 2.53125, + "grad_norm_var": 0.04576416015625, + "learning_rate": 0.0001, + "loss": 7.5453, + "loss/crossentropy": 1.9688183665275574, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2124113142490387, + "step": 3526 + }, + { + "epoch": 0.2205, + "grad_norm": 2.40625, + "grad_norm_var": 0.04296468098958333, + "learning_rate": 0.0001, + "loss": 7.5413, + "loss/crossentropy": 2.304739475250244, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24053708463907242, + "step": 3528 + }, + { + "epoch": 0.220625, + "grad_norm": 2.40625, + "grad_norm_var": 0.040282185872395834, + "learning_rate": 0.0001, + "loss": 7.5082, + "loss/crossentropy": 2.188236117362976, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2476644068956375, + "step": 3530 + }, + { + "epoch": 0.22075, + "grad_norm": 2.28125, + "grad_norm_var": 0.03193257649739583, + "learning_rate": 0.0001, + "loss": 7.4935, + "loss/crossentropy": 2.452435612678528, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2143031656742096, + "step": 3532 + }, + { + "epoch": 0.220875, + "grad_norm": 2.21875, + "grad_norm_var": 0.012613932291666666, + "learning_rate": 0.0001, + "loss": 7.4794, + "loss/crossentropy": 2.0797160863876343, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2122730240225792, + "step": 3534 + }, + { + "epoch": 0.221, + "grad_norm": 2.484375, + "grad_norm_var": 0.010595703125, + "learning_rate": 0.0001, + "loss": 7.4632, + "loss/crossentropy": 2.4266319274902344, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2510553449392319, + "step": 3536 + }, + { + "epoch": 0.221125, + "grad_norm": 2.203125, + "grad_norm_var": 0.012450154622395833, + "learning_rate": 0.0001, + "loss": 7.6415, + "loss/crossentropy": 2.059949517250061, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23185402899980545, + "step": 3538 + }, + { + "epoch": 0.22125, + "grad_norm": 2.53125, + "grad_norm_var": 0.01265869140625, + "learning_rate": 0.0001, + "loss": 7.6176, + "loss/crossentropy": 2.5653083324432373, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23594635725021362, + "step": 3540 + }, + { + "epoch": 0.221375, + "grad_norm": 2.265625, + "grad_norm_var": 0.013667805989583334, + "learning_rate": 0.0001, + "loss": 7.481, + "loss/crossentropy": 2.3301087617874146, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2452377825975418, + "step": 3542 + }, + { + "epoch": 0.2215, + "grad_norm": 2.34375, + "grad_norm_var": 0.0131011962890625, + "learning_rate": 0.0001, + "loss": 7.5606, + "loss/crossentropy": 2.2409706115722656, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2660795971751213, + "step": 3544 + }, + { + "epoch": 0.221625, + "grad_norm": 2.65625, + "grad_norm_var": 0.020589192708333332, + "learning_rate": 0.0001, + "loss": 7.6275, + "loss/crossentropy": 2.341962456703186, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23292769491672516, + "step": 3546 + }, + { + "epoch": 0.22175, + "grad_norm": 2.546875, + "grad_norm_var": 0.020503743489583334, + "learning_rate": 0.0001, + "loss": 7.5761, + "loss/crossentropy": 2.27796733379364, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22092991322278976, + "step": 3548 + }, + { + "epoch": 0.221875, + "grad_norm": 2.375, + "grad_norm_var": 0.0172515869140625, + "learning_rate": 0.0001, + "loss": 7.5742, + "loss/crossentropy": 2.189824938774109, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23947207629680634, + "step": 3550 + }, + { + "epoch": 0.222, + "grad_norm": 2.453125, + "grad_norm_var": 0.017186482747395832, + "learning_rate": 0.0001, + "loss": 7.5813, + "loss/crossentropy": 2.4192023277282715, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.24019119888544083, + "step": 3552 + }, + { + "epoch": 0.222125, + "grad_norm": 2.375, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 7.5347, + "loss/crossentropy": 2.158234715461731, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2341742068529129, + "step": 3554 + }, + { + "epoch": 0.22225, + "grad_norm": 3.203125, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 0.0001, + "loss": 7.378, + "loss/crossentropy": 2.2715145349502563, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22904645651578903, + "step": 3556 + }, + { + "epoch": 0.222375, + "grad_norm": 2.375, + "grad_norm_var": 0.26266276041666664, + "learning_rate": 0.0001, + "loss": 7.6181, + "loss/crossentropy": 2.2331241369247437, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.25211699306964874, + "step": 3558 + }, + { + "epoch": 0.2225, + "grad_norm": 2.375, + "grad_norm_var": 0.2688140869140625, + "learning_rate": 0.0001, + "loss": 7.5199, + "loss/crossentropy": 1.9353508949279785, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2270013615489006, + "step": 3560 + }, + { + "epoch": 0.222625, + "grad_norm": 2.40625, + "grad_norm_var": 0.27647196451822914, + "learning_rate": 0.0001, + "loss": 7.5403, + "loss/crossentropy": 2.4099196195602417, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.24662812799215317, + "step": 3562 + }, + { + "epoch": 0.22275, + "grad_norm": 2.71875, + "grad_norm_var": 0.27763264973958335, + "learning_rate": 0.0001, + "loss": 7.7591, + "loss/crossentropy": 2.163403630256653, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.22525054216384888, + "step": 3564 + }, + { + "epoch": 0.222875, + "grad_norm": 2.484375, + "grad_norm_var": 0.27356363932291666, + "learning_rate": 0.0001, + "loss": 7.6724, + "loss/crossentropy": 2.3632307052612305, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.221424400806427, + "step": 3566 + }, + { + "epoch": 0.223, + "grad_norm": 2.359375, + "grad_norm_var": 0.2787394205729167, + "learning_rate": 0.0001, + "loss": 7.4518, + "loss/crossentropy": 2.079641282558441, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22443066537380219, + "step": 3568 + }, + { + "epoch": 0.223125, + "grad_norm": 2.578125, + "grad_norm_var": 0.28662109375, + "learning_rate": 0.0001, + "loss": 7.5553, + "loss/crossentropy": 2.0237990021705627, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21297388523817062, + "step": 3570 + }, + { + "epoch": 0.22325, + "grad_norm": 2.625, + "grad_norm_var": 0.26023763020833335, + "learning_rate": 0.0001, + "loss": 7.9066, + "loss/crossentropy": 2.4305167198181152, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24352750182151794, + "step": 3572 + }, + { + "epoch": 0.223375, + "grad_norm": 2.453125, + "grad_norm_var": 0.03300679524739583, + "learning_rate": 0.0001, + "loss": 7.8042, + "loss/crossentropy": 2.296820282936096, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23312810063362122, + "step": 3574 + }, + { + "epoch": 0.2235, + "grad_norm": 2.65625, + "grad_norm_var": 0.029866536458333332, + "learning_rate": 0.0001, + "loss": 7.7462, + "loss/crossentropy": 2.3262380361557007, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.22458947449922562, + "step": 3576 + }, + { + "epoch": 0.223625, + "grad_norm": 2.6875, + "grad_norm_var": 0.027179972330729166, + "learning_rate": 0.0001, + "loss": 7.6477, + "loss/crossentropy": 2.4565069675445557, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2450973466038704, + "step": 3578 + }, + { + "epoch": 0.22375, + "grad_norm": 2.765625, + "grad_norm_var": 0.028913370768229165, + "learning_rate": 0.0001, + "loss": 7.5068, + "loss/crossentropy": 1.9984254240989685, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.19529377669095993, + "step": 3580 + }, + { + "epoch": 0.223875, + "grad_norm": 2.5, + "grad_norm_var": 0.02584228515625, + "learning_rate": 0.0001, + "loss": 7.5913, + "loss/crossentropy": 2.2899560928344727, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.25301460921764374, + "step": 3582 + }, + { + "epoch": 0.224, + "grad_norm": 2.421875, + "grad_norm_var": 0.026764933268229166, + "learning_rate": 0.0001, + "loss": 7.5833, + "loss/crossentropy": 2.1312328577041626, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.22322946041822433, + "step": 3584 + }, + { + "epoch": 0.224125, + "grad_norm": 2.421875, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 7.6728, + "loss/crossentropy": 2.5150551795959473, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2675798535346985, + "step": 3586 + }, + { + "epoch": 0.22425, + "grad_norm": 2.5, + "grad_norm_var": 0.0173492431640625, + "learning_rate": 0.0001, + "loss": 7.5826, + "loss/crossentropy": 2.1855462789535522, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.21716494113206863, + "step": 3588 + }, + { + "epoch": 0.224375, + "grad_norm": 2.328125, + "grad_norm_var": 0.0208892822265625, + "learning_rate": 0.0001, + "loss": 7.4135, + "loss/crossentropy": 2.4238641262054443, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.23732400685548782, + "step": 3590 + }, + { + "epoch": 0.2245, + "grad_norm": 2.46875, + "grad_norm_var": 0.020002237955729165, + "learning_rate": 0.0001, + "loss": 7.5734, + "loss/crossentropy": 2.313872456550598, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2245703637599945, + "step": 3592 + }, + { + "epoch": 0.224625, + "grad_norm": 2.421875, + "grad_norm_var": 0.017822265625, + "learning_rate": 0.0001, + "loss": 7.6389, + "loss/crossentropy": 2.149410605430603, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22927816212177277, + "step": 3594 + }, + { + "epoch": 0.22475, + "grad_norm": 2.671875, + "grad_norm_var": 0.013117472330729166, + "learning_rate": 0.0001, + "loss": 7.594, + "loss/crossentropy": 2.228400468826294, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23162036389112473, + "step": 3596 + }, + { + "epoch": 0.224875, + "grad_norm": 2.3125, + "grad_norm_var": 0.0120758056640625, + "learning_rate": 0.0001, + "loss": 7.6038, + "loss/crossentropy": 2.181916832923889, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.21812283247709274, + "step": 3598 + }, + { + "epoch": 0.225, + "grad_norm": 2.484375, + "grad_norm_var": 0.0110015869140625, + "learning_rate": 0.0001, + "loss": 7.7592, + "loss/crossentropy": 2.3809478282928467, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2515050619840622, + "step": 3600 + }, + { + "epoch": 0.225125, + "grad_norm": 2.34375, + "grad_norm_var": 0.011324055989583333, + "learning_rate": 0.0001, + "loss": 7.5347, + "loss/crossentropy": 2.1800352334976196, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20716014504432678, + "step": 3602 + }, + { + "epoch": 0.22525, + "grad_norm": 2.328125, + "grad_norm_var": 0.01744384765625, + "learning_rate": 0.0001, + "loss": 7.5785, + "loss/crossentropy": 2.423554301261902, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24492305517196655, + "step": 3604 + }, + { + "epoch": 0.225375, + "grad_norm": 2.78125, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 7.7038, + "loss/crossentropy": 2.3537445068359375, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2731524705886841, + "step": 3606 + }, + { + "epoch": 0.2255, + "grad_norm": 2.609375, + "grad_norm_var": 0.04365234375, + "learning_rate": 0.0001, + "loss": 7.7234, + "loss/crossentropy": 2.3835121393203735, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22908172756433487, + "step": 3608 + }, + { + "epoch": 0.225625, + "grad_norm": 2.296875, + "grad_norm_var": 0.04636942545572917, + "learning_rate": 0.0001, + "loss": 7.6703, + "loss/crossentropy": 2.6490813493728638, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22531013935804367, + "step": 3610 + }, + { + "epoch": 0.22575, + "grad_norm": 2.34375, + "grad_norm_var": 0.05110575358072917, + "learning_rate": 0.0001, + "loss": 7.5759, + "loss/crossentropy": 2.1871402263641357, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.22304056584835052, + "step": 3612 + }, + { + "epoch": 0.225875, + "grad_norm": 2.53125, + "grad_norm_var": 0.0443511962890625, + "learning_rate": 0.0001, + "loss": 7.4932, + "loss/crossentropy": 2.2581640481948853, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.24387839436531067, + "step": 3614 + }, + { + "epoch": 0.226, + "grad_norm": 2.5625, + "grad_norm_var": 0.044489542643229164, + "learning_rate": 0.0001, + "loss": 7.7008, + "loss/crossentropy": 2.2254581451416016, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22644620388746262, + "step": 3616 + }, + { + "epoch": 0.226125, + "grad_norm": 2.359375, + "grad_norm_var": 0.04413960774739583, + "learning_rate": 0.0001, + "loss": 7.5712, + "loss/crossentropy": 2.0950043201446533, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24633889645338058, + "step": 3618 + }, + { + "epoch": 0.22625, + "grad_norm": 2.296875, + "grad_norm_var": 0.042292277018229164, + "learning_rate": 0.0001, + "loss": 7.6548, + "loss/crossentropy": 2.2507599592208862, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.22745974361896515, + "step": 3620 + }, + { + "epoch": 0.226375, + "grad_norm": 2.609375, + "grad_norm_var": 0.032613118489583336, + "learning_rate": 0.0001, + "loss": 7.6329, + "loss/crossentropy": 2.114292323589325, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22122054547071457, + "step": 3622 + }, + { + "epoch": 0.2265, + "grad_norm": 2.296875, + "grad_norm_var": 0.011823527018229167, + "learning_rate": 0.0001, + "loss": 7.532, + "loss/crossentropy": 2.210999310016632, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22689851373434067, + "step": 3624 + }, + { + "epoch": 0.226625, + "grad_norm": 2.34375, + "grad_norm_var": 0.014583333333333334, + "learning_rate": 0.0001, + "loss": 7.4216, + "loss/crossentropy": 2.5903425216674805, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22328981757164001, + "step": 3626 + }, + { + "epoch": 0.22675, + "grad_norm": 2.453125, + "grad_norm_var": 0.01275634765625, + "learning_rate": 0.0001, + "loss": 7.5963, + "loss/crossentropy": 2.3979709148406982, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2354452759027481, + "step": 3628 + }, + { + "epoch": 0.226875, + "grad_norm": 2.34375, + "grad_norm_var": 0.01240234375, + "learning_rate": 0.0001, + "loss": 7.4937, + "loss/crossentropy": 2.308434844017029, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22410962730646133, + "step": 3630 + }, + { + "epoch": 0.227, + "grad_norm": 2.140625, + "grad_norm_var": 0.01529541015625, + "learning_rate": 0.0001, + "loss": 7.5951, + "loss/crossentropy": 2.4131675958633423, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24779768288135529, + "step": 3632 + }, + { + "epoch": 0.227125, + "grad_norm": 2.796875, + "grad_norm_var": 0.024723307291666666, + "learning_rate": 0.0001, + "loss": 7.5681, + "loss/crossentropy": 2.2655831575393677, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2255219966173172, + "step": 3634 + }, + { + "epoch": 0.22725, + "grad_norm": 2.25, + "grad_norm_var": 0.025487263997395832, + "learning_rate": 0.0001, + "loss": 7.4176, + "loss/crossentropy": 2.3867735862731934, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24707113206386566, + "step": 3636 + }, + { + "epoch": 0.227375, + "grad_norm": 2.46875, + "grad_norm_var": 0.022834269205729167, + "learning_rate": 0.0001, + "loss": 7.532, + "loss/crossentropy": 2.3354828357696533, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2387101650238037, + "step": 3638 + }, + { + "epoch": 0.2275, + "grad_norm": 2.4375, + "grad_norm_var": 0.022581990559895834, + "learning_rate": 0.0001, + "loss": 7.4607, + "loss/crossentropy": 2.164547324180603, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21518608927726746, + "step": 3640 + }, + { + "epoch": 0.227625, + "grad_norm": 2.5, + "grad_norm_var": 0.0202056884765625, + "learning_rate": 0.0001, + "loss": 7.6245, + "loss/crossentropy": 2.3133569955825806, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2465798780322075, + "step": 3642 + }, + { + "epoch": 0.22775, + "grad_norm": 2.359375, + "grad_norm_var": 0.02066650390625, + "learning_rate": 0.0001, + "loss": 7.5408, + "loss/crossentropy": 2.3382097482681274, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22838010638952255, + "step": 3644 + }, + { + "epoch": 0.227875, + "grad_norm": 2.484375, + "grad_norm_var": 0.0206695556640625, + "learning_rate": 0.0001, + "loss": 7.4479, + "loss/crossentropy": 2.2210439443588257, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21334365755319595, + "step": 3646 + }, + { + "epoch": 0.228, + "grad_norm": 2.3125, + "grad_norm_var": 0.019310506184895833, + "learning_rate": 0.0001, + "loss": 7.5992, + "loss/crossentropy": 2.1229522228240967, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23556266725063324, + "step": 3648 + }, + { + "epoch": 0.228125, + "grad_norm": 2.421875, + "grad_norm_var": 0.009129842122395834, + "learning_rate": 0.0001, + "loss": 7.5133, + "loss/crossentropy": 2.335044503211975, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.25041337311267853, + "step": 3650 + }, + { + "epoch": 0.22825, + "grad_norm": 2.4375, + "grad_norm_var": 0.01851806640625, + "learning_rate": 0.0001, + "loss": 7.5085, + "loss/crossentropy": 2.1980998516082764, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21712280064821243, + "step": 3652 + }, + { + "epoch": 0.228375, + "grad_norm": 2.53125, + "grad_norm_var": 0.021100870768229165, + "learning_rate": 0.0001, + "loss": 7.7472, + "loss/crossentropy": 2.513595938682556, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2537487596273422, + "step": 3654 + }, + { + "epoch": 0.2285, + "grad_norm": 2.484375, + "grad_norm_var": 0.019722493489583333, + "learning_rate": 0.0001, + "loss": 7.515, + "loss/crossentropy": 2.3871147632598877, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.24276654422283173, + "step": 3656 + }, + { + "epoch": 0.228625, + "grad_norm": 2.390625, + "grad_norm_var": 0.021703084309895832, + "learning_rate": 0.0001, + "loss": 7.5239, + "loss/crossentropy": 2.449997901916504, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23972835391759872, + "step": 3658 + }, + { + "epoch": 0.22875, + "grad_norm": 2.1875, + "grad_norm_var": 0.023957316080729166, + "learning_rate": 0.0001, + "loss": 7.3961, + "loss/crossentropy": 1.9787690043449402, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21979594230651855, + "step": 3660 + }, + { + "epoch": 0.228875, + "grad_norm": 2.234375, + "grad_norm_var": 0.030777994791666666, + "learning_rate": 0.0001, + "loss": 7.4817, + "loss/crossentropy": 2.327951431274414, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144240364432335, + "step": 3662 + }, + { + "epoch": 0.229, + "grad_norm": 2.28125, + "grad_norm_var": 0.028954060872395833, + "learning_rate": 0.0001, + "loss": 7.4867, + "loss/crossentropy": 2.2228533029556274, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2117415815591812, + "step": 3664 + }, + { + "epoch": 0.229125, + "grad_norm": 2.40625, + "grad_norm_var": 0.030817667643229168, + "learning_rate": 0.0001, + "loss": 7.5446, + "loss/crossentropy": 2.5074501037597656, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2416432872414589, + "step": 3666 + }, + { + "epoch": 0.22925, + "grad_norm": 2.671875, + "grad_norm_var": 0.023502604166666666, + "learning_rate": 0.0001, + "loss": 7.5777, + "loss/crossentropy": 2.1035314798355103, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21771013736724854, + "step": 3668 + }, + { + "epoch": 0.229375, + "grad_norm": 2.484375, + "grad_norm_var": 0.026463826497395832, + "learning_rate": 0.0001, + "loss": 7.5223, + "loss/crossentropy": 2.156371831893921, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2226782739162445, + "step": 3670 + }, + { + "epoch": 0.2295, + "grad_norm": 2.34375, + "grad_norm_var": 0.0279296875, + "learning_rate": 0.0001, + "loss": 7.5712, + "loss/crossentropy": 2.3843045234680176, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23753250390291214, + "step": 3672 + }, + { + "epoch": 0.229625, + "grad_norm": 2.4375, + "grad_norm_var": 0.024128214518229166, + "learning_rate": 0.0001, + "loss": 7.6631, + "loss/crossentropy": 2.2664679288864136, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23596899211406708, + "step": 3674 + }, + { + "epoch": 0.22975, + "grad_norm": 2.5625, + "grad_norm_var": 0.024051920572916666, + "learning_rate": 0.0001, + "loss": 7.3774, + "loss/crossentropy": 2.0786343812942505, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21789100021123886, + "step": 3676 + }, + { + "epoch": 0.229875, + "grad_norm": 2.296875, + "grad_norm_var": 0.0205230712890625, + "learning_rate": 0.0001, + "loss": 7.5573, + "loss/crossentropy": 2.148313283920288, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22734209895133972, + "step": 3678 + }, + { + "epoch": 0.23, + "grad_norm": 2.453125, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 7.5015, + "loss/crossentropy": 2.3508098125457764, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22268230468034744, + "step": 3680 + }, + { + "epoch": 0.230125, + "grad_norm": 2.296875, + "grad_norm_var": 0.018773396809895832, + "learning_rate": 0.0001, + "loss": 7.5282, + "loss/crossentropy": 2.419388175010681, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2422405481338501, + "step": 3682 + }, + { + "epoch": 0.23025, + "grad_norm": 2.453125, + "grad_norm_var": 0.0137603759765625, + "learning_rate": 0.0001, + "loss": 7.5899, + "loss/crossentropy": 2.2298837900161743, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21447760611772537, + "step": 3684 + }, + { + "epoch": 0.230375, + "grad_norm": 2.546875, + "grad_norm_var": 0.013605753580729166, + "learning_rate": 0.0001, + "loss": 7.59, + "loss/crossentropy": 2.4269654750823975, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24916177242994308, + "step": 3686 + }, + { + "epoch": 0.2305, + "grad_norm": 2.546875, + "grad_norm_var": 0.010431925455729166, + "learning_rate": 0.0001, + "loss": 7.4829, + "loss/crossentropy": 1.9996501207351685, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23489703238010406, + "step": 3688 + }, + { + "epoch": 0.230625, + "grad_norm": 2.34375, + "grad_norm_var": 0.014469401041666666, + "learning_rate": 0.0001, + "loss": 7.7753, + "loss/crossentropy": 2.3211807012557983, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.237454354763031, + "step": 3690 + }, + { + "epoch": 0.23075, + "grad_norm": 2.5, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 7.5813, + "loss/crossentropy": 2.1937203407287598, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2389645278453827, + "step": 3692 + }, + { + "epoch": 0.230875, + "grad_norm": 2.546875, + "grad_norm_var": 0.01240234375, + "learning_rate": 0.0001, + "loss": 7.7269, + "loss/crossentropy": 2.413442850112915, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24230879545211792, + "step": 3694 + }, + { + "epoch": 0.231, + "grad_norm": 2.40625, + "grad_norm_var": 0.0149078369140625, + "learning_rate": 0.0001, + "loss": 7.5778, + "loss/crossentropy": 2.252353072166443, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22742437571287155, + "step": 3696 + }, + { + "epoch": 0.231125, + "grad_norm": 2.703125, + "grad_norm_var": 0.020670572916666668, + "learning_rate": 0.0001, + "loss": 7.6338, + "loss/crossentropy": 2.2866071462631226, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.22707533836364746, + "step": 3698 + }, + { + "epoch": 0.23125, + "grad_norm": 2.359375, + "grad_norm_var": 0.023470052083333335, + "learning_rate": 0.0001, + "loss": 7.4712, + "loss/crossentropy": 1.939712941646576, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.23170647770166397, + "step": 3700 + }, + { + "epoch": 0.231375, + "grad_norm": 2.515625, + "grad_norm_var": 0.022196451822916668, + "learning_rate": 0.0001, + "loss": 7.5799, + "loss/crossentropy": 1.7738837003707886, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.24364864826202393, + "step": 3702 + }, + { + "epoch": 0.2315, + "grad_norm": 2.640625, + "grad_norm_var": 0.0230865478515625, + "learning_rate": 0.0001, + "loss": 7.5375, + "loss/crossentropy": 2.1018226742744446, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22683311998844147, + "step": 3704 + }, + { + "epoch": 0.231625, + "grad_norm": 2.515625, + "grad_norm_var": 0.0196197509765625, + "learning_rate": 0.0001, + "loss": 7.6113, + "loss/crossentropy": 2.2435100078582764, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22194090485572815, + "step": 3706 + }, + { + "epoch": 0.23175, + "grad_norm": 2.390625, + "grad_norm_var": 0.019147745768229165, + "learning_rate": 0.0001, + "loss": 7.5735, + "loss/crossentropy": 2.114552319049835, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21233541518449783, + "step": 3708 + }, + { + "epoch": 0.231875, + "grad_norm": 2.46875, + "grad_norm_var": 0.020164998372395833, + "learning_rate": 0.0001, + "loss": 7.4842, + "loss/crossentropy": 2.340905785560608, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23916912078857422, + "step": 3710 + }, + { + "epoch": 0.232, + "grad_norm": 2.65625, + "grad_norm_var": 0.018675740559895834, + "learning_rate": 0.0001, + "loss": 7.5672, + "loss/crossentropy": 2.2121574878692627, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.22300127893686295, + "step": 3712 + }, + { + "epoch": 0.232125, + "grad_norm": 2.28125, + "grad_norm_var": 0.015852864583333334, + "learning_rate": 0.0001, + "loss": 7.5457, + "loss/crossentropy": 2.350710391998291, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2501726374030113, + "step": 3714 + }, + { + "epoch": 0.23225, + "grad_norm": 2.515625, + "grad_norm_var": 0.015983072916666667, + "learning_rate": 0.0001, + "loss": 7.5465, + "loss/crossentropy": 2.4245604276657104, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22501108050346375, + "step": 3716 + }, + { + "epoch": 0.232375, + "grad_norm": 2.3125, + "grad_norm_var": 0.017215983072916666, + "learning_rate": 0.0001, + "loss": 7.6819, + "loss/crossentropy": 2.2433913946151733, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23217583447694778, + "step": 3718 + }, + { + "epoch": 0.2325, + "grad_norm": 2.21875, + "grad_norm_var": 0.017194620768229165, + "learning_rate": 0.0001, + "loss": 7.462, + "loss/crossentropy": 2.203797459602356, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21597397327423096, + "step": 3720 + }, + { + "epoch": 0.232625, + "grad_norm": 2.375, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 7.4907, + "loss/crossentropy": 2.274090051651001, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.22629251331090927, + "step": 3722 + }, + { + "epoch": 0.23275, + "grad_norm": 2.515625, + "grad_norm_var": 0.016402180989583334, + "learning_rate": 0.0001, + "loss": 7.5762, + "loss/crossentropy": 2.1787149906158447, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21793190389871597, + "step": 3724 + }, + { + "epoch": 0.232875, + "grad_norm": 2.359375, + "grad_norm_var": 0.018001302083333334, + "learning_rate": 0.0001, + "loss": 7.5742, + "loss/crossentropy": 2.472353219985962, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2537408918142319, + "step": 3726 + }, + { + "epoch": 0.233, + "grad_norm": 2.546875, + "grad_norm_var": 0.012376912434895833, + "learning_rate": 0.0001, + "loss": 7.7366, + "loss/crossentropy": 2.1297940015792847, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24158430099487305, + "step": 3728 + }, + { + "epoch": 0.233125, + "grad_norm": 2.375, + "grad_norm_var": 0.0106109619140625, + "learning_rate": 0.0001, + "loss": 7.5799, + "loss/crossentropy": 2.307152509689331, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2116793394088745, + "step": 3730 + }, + { + "epoch": 0.23325, + "grad_norm": 2.484375, + "grad_norm_var": 0.010465494791666667, + "learning_rate": 0.0001, + "loss": 7.4555, + "loss/crossentropy": 1.9622553586959839, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.22563175857067108, + "step": 3732 + }, + { + "epoch": 0.233375, + "grad_norm": 2.46875, + "grad_norm_var": 0.011637369791666666, + "learning_rate": 0.0001, + "loss": 7.558, + "loss/crossentropy": 2.1677664518356323, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22453109174966812, + "step": 3734 + }, + { + "epoch": 0.2335, + "grad_norm": 2.578125, + "grad_norm_var": 0.09955952962239584, + "learning_rate": 0.0001, + "loss": 7.7388, + "loss/crossentropy": 2.192821502685547, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23431353271007538, + "step": 3736 + }, + { + "epoch": 0.233625, + "grad_norm": 2.53125, + "grad_norm_var": 0.10053609212239584, + "learning_rate": 0.0001, + "loss": 7.5906, + "loss/crossentropy": 2.1779892444610596, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23049022257328033, + "step": 3738 + }, + { + "epoch": 0.23375, + "grad_norm": 2.28125, + "grad_norm_var": 0.10708719889322917, + "learning_rate": 0.0001, + "loss": 7.4549, + "loss/crossentropy": 2.402343273162842, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2568327337503433, + "step": 3740 + }, + { + "epoch": 0.233875, + "grad_norm": 2.359375, + "grad_norm_var": 0.10690104166666667, + "learning_rate": 0.0001, + "loss": 7.5119, + "loss/crossentropy": 2.1869853734970093, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2425551563501358, + "step": 3742 + }, + { + "epoch": 0.234, + "grad_norm": 2.375, + "grad_norm_var": 0.11015218098958333, + "learning_rate": 0.0001, + "loss": 7.5073, + "loss/crossentropy": 2.2516770362854004, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2116701528429985, + "step": 3744 + }, + { + "epoch": 0.234125, + "grad_norm": 2.484375, + "grad_norm_var": 0.11015218098958333, + "learning_rate": 0.0001, + "loss": 7.5986, + "loss/crossentropy": 2.005309283733368, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.20876743644475937, + "step": 3746 + }, + { + "epoch": 0.23425, + "grad_norm": 2.4375, + "grad_norm_var": 0.10966695149739583, + "learning_rate": 0.0001, + "loss": 7.4817, + "loss/crossentropy": 2.5279784202575684, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23321525007486343, + "step": 3748 + }, + { + "epoch": 0.234375, + "grad_norm": 2.453125, + "grad_norm_var": 0.11015218098958333, + "learning_rate": 0.0001, + "loss": 7.5474, + "loss/crossentropy": 2.34593665599823, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.23283468186855316, + "step": 3750 + }, + { + "epoch": 0.2345, + "grad_norm": 2.1875, + "grad_norm_var": 0.024665323893229167, + "learning_rate": 0.0001, + "loss": 7.4105, + "loss/crossentropy": 2.249088764190674, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24345380067825317, + "step": 3752 + }, + { + "epoch": 0.234625, + "grad_norm": 2.328125, + "grad_norm_var": 0.01451416015625, + "learning_rate": 0.0001, + "loss": 7.5255, + "loss/crossentropy": 2.2118486166000366, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435833364725113, + "step": 3754 + }, + { + "epoch": 0.23475, + "grad_norm": 2.59375, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 0.0001, + "loss": 7.6165, + "loss/crossentropy": 2.2884024381637573, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.23158583790063858, + "step": 3756 + }, + { + "epoch": 0.234875, + "grad_norm": 2.609375, + "grad_norm_var": 0.0189361572265625, + "learning_rate": 0.0001, + "loss": 7.7033, + "loss/crossentropy": 2.2991076707839966, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23600934445858002, + "step": 3758 + }, + { + "epoch": 0.235, + "grad_norm": 2.1875, + "grad_norm_var": 0.020589192708333332, + "learning_rate": 0.0001, + "loss": 7.4587, + "loss/crossentropy": 2.1402446627616882, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.21911373734474182, + "step": 3760 + }, + { + "epoch": 0.235125, + "grad_norm": 2.453125, + "grad_norm_var": 0.0215728759765625, + "learning_rate": 0.0001, + "loss": 7.6719, + "loss/crossentropy": 2.234601616859436, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22324847429990768, + "step": 3762 + }, + { + "epoch": 0.23525, + "grad_norm": 2.34375, + "grad_norm_var": 0.022412109375, + "learning_rate": 0.0001, + "loss": 7.5913, + "loss/crossentropy": 2.2850943207740784, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22513452172279358, + "step": 3764 + }, + { + "epoch": 0.235375, + "grad_norm": 2.3125, + "grad_norm_var": 0.01890869140625, + "learning_rate": 0.0001, + "loss": 7.6234, + "loss/crossentropy": 2.122738838195801, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22072789072990417, + "step": 3766 + }, + { + "epoch": 0.2355, + "grad_norm": 2.421875, + "grad_norm_var": 0.0145172119140625, + "learning_rate": 0.0001, + "loss": 7.5619, + "loss/crossentropy": 2.4355037212371826, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.24772146344184875, + "step": 3768 + }, + { + "epoch": 0.235625, + "grad_norm": 2.453125, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 7.5071, + "loss/crossentropy": 1.8801981806755066, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22862768173217773, + "step": 3770 + }, + { + "epoch": 0.23575, + "grad_norm": 2.640625, + "grad_norm_var": 0.017650349934895834, + "learning_rate": 0.0001, + "loss": 7.4754, + "loss/crossentropy": 2.2766274213790894, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2313675582408905, + "step": 3772 + }, + { + "epoch": 0.235875, + "grad_norm": 2.265625, + "grad_norm_var": 0.018610636393229168, + "learning_rate": 0.0001, + "loss": 7.5267, + "loss/crossentropy": 2.0864307284355164, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2242383509874344, + "step": 3774 + }, + { + "epoch": 0.236, + "grad_norm": 2.390625, + "grad_norm_var": 0.01572265625, + "learning_rate": 0.0001, + "loss": 7.4776, + "loss/crossentropy": 2.2763630151748657, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.24077893048524857, + "step": 3776 + }, + { + "epoch": 0.236125, + "grad_norm": 2.375, + "grad_norm_var": 0.014127604166666667, + "learning_rate": 0.0001, + "loss": 7.5641, + "loss/crossentropy": 2.2426193952560425, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2291230633854866, + "step": 3778 + }, + { + "epoch": 0.23625, + "grad_norm": 2.28125, + "grad_norm_var": 0.015558878580729166, + "learning_rate": 0.0001, + "loss": 7.4933, + "loss/crossentropy": 2.081269860267639, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.25176841020584106, + "step": 3780 + }, + { + "epoch": 0.236375, + "grad_norm": 6.09375, + "grad_norm_var": 1.8935546875, + "learning_rate": 0.0001, + "loss": 7.8648, + "loss/crossentropy": 2.011550545692444, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.25546982884407043, + "step": 3782 + }, + { + "epoch": 0.2365, + "grad_norm": 2.59375, + "grad_norm_var": 1.8919993082682292, + "learning_rate": 0.0001, + "loss": 7.9057, + "loss/crossentropy": 2.4982590675354004, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2651009112596512, + "step": 3784 + }, + { + "epoch": 0.236625, + "grad_norm": 2.71875, + "grad_norm_var": 1.8650950113932292, + "learning_rate": 0.0001, + "loss": 7.6646, + "loss/crossentropy": 2.2798246145248413, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22900397330522537, + "step": 3786 + }, + { + "epoch": 0.23675, + "grad_norm": 2.390625, + "grad_norm_var": 1.87945556640625, + "learning_rate": 0.0001, + "loss": 7.379, + "loss/crossentropy": 2.03772509098053, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22353503108024597, + "step": 3788 + }, + { + "epoch": 0.236875, + "grad_norm": 2.734375, + "grad_norm_var": 1.82320556640625, + "learning_rate": 0.0001, + "loss": 7.4055, + "loss/crossentropy": 2.317867159843445, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23167508840560913, + "step": 3790 + }, + { + "epoch": 0.237, + "grad_norm": 2.1875, + "grad_norm_var": 1.84205322265625, + "learning_rate": 0.0001, + "loss": 7.4846, + "loss/crossentropy": 2.010311722755432, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.20548687875270844, + "step": 3792 + }, + { + "epoch": 0.237125, + "grad_norm": 2.359375, + "grad_norm_var": 1.8334625244140625, + "learning_rate": 0.0001, + "loss": 7.5309, + "loss/crossentropy": 2.325111150741577, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2562504708766937, + "step": 3794 + }, + { + "epoch": 0.23725, + "grad_norm": 2.328125, + "grad_norm_var": 1.8178131103515625, + "learning_rate": 0.0001, + "loss": 7.5873, + "loss/crossentropy": 2.0214288234710693, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2158362790942192, + "step": 3796 + }, + { + "epoch": 0.237375, + "grad_norm": 2.390625, + "grad_norm_var": 0.1085845947265625, + "learning_rate": 0.0001, + "loss": 7.6152, + "loss/crossentropy": 2.025280773639679, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2420121431350708, + "step": 3798 + }, + { + "epoch": 0.2375, + "grad_norm": 2.453125, + "grad_norm_var": 0.025788370768229166, + "learning_rate": 0.0001, + "loss": 7.5203, + "loss/crossentropy": 2.2576816082000732, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24750325828790665, + "step": 3800 + }, + { + "epoch": 0.237625, + "grad_norm": 2.515625, + "grad_norm_var": 0.019807942708333335, + "learning_rate": 0.0001, + "loss": 7.562, + "loss/crossentropy": 2.4177106618881226, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2321905866265297, + "step": 3802 + }, + { + "epoch": 0.23775, + "grad_norm": 2.25, + "grad_norm_var": 0.017560831705729165, + "learning_rate": 0.0001, + "loss": 7.6246, + "loss/crossentropy": 2.381041169166565, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22734872996807098, + "step": 3804 + }, + { + "epoch": 0.237875, + "grad_norm": 2.1875, + "grad_norm_var": 0.014777628580729167, + "learning_rate": 0.0001, + "loss": 7.2978, + "loss/crossentropy": 2.169810175895691, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21825361251831055, + "step": 3806 + }, + { + "epoch": 0.238, + "grad_norm": 2.703125, + "grad_norm_var": 0.01949462890625, + "learning_rate": 0.0001, + "loss": 7.6745, + "loss/crossentropy": 2.2910315990448, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2431943118572235, + "step": 3808 + }, + { + "epoch": 0.238125, + "grad_norm": 2.234375, + "grad_norm_var": 0.020799763997395835, + "learning_rate": 0.0001, + "loss": 7.4715, + "loss/crossentropy": 2.32381534576416, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21884576976299286, + "step": 3810 + }, + { + "epoch": 0.23825, + "grad_norm": 2.40625, + "grad_norm_var": 0.02027587890625, + "learning_rate": 0.0001, + "loss": 7.626, + "loss/crossentropy": 2.3181525468826294, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23518574982881546, + "step": 3812 + }, + { + "epoch": 0.238375, + "grad_norm": 2.328125, + "grad_norm_var": 0.021712239583333334, + "learning_rate": 0.0001, + "loss": 7.718, + "loss/crossentropy": 2.4021564722061157, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22601833939552307, + "step": 3814 + }, + { + "epoch": 0.2385, + "grad_norm": 2.671875, + "grad_norm_var": 0.026253255208333333, + "learning_rate": 0.0001, + "loss": 7.5776, + "loss/crossentropy": 2.241186261177063, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22214636206626892, + "step": 3816 + }, + { + "epoch": 0.238625, + "grad_norm": 2.203125, + "grad_norm_var": 0.028076171875, + "learning_rate": 0.0001, + "loss": 7.4295, + "loss/crossentropy": 2.3284627199172974, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23751358687877655, + "step": 3818 + }, + { + "epoch": 0.23875, + "grad_norm": 2.328125, + "grad_norm_var": 0.025634765625, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 1.878059983253479, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.20184766501188278, + "step": 3820 + }, + { + "epoch": 0.238875, + "grad_norm": 2.296875, + "grad_norm_var": 0.021415201822916667, + "learning_rate": 0.0001, + "loss": 7.4835, + "loss/crossentropy": 2.2008787393569946, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2629189044237137, + "step": 3822 + }, + { + "epoch": 0.239, + "grad_norm": 2.5625, + "grad_norm_var": 0.016803995768229166, + "learning_rate": 0.0001, + "loss": 7.4507, + "loss/crossentropy": 2.3513262271881104, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22654324024915695, + "step": 3824 + }, + { + "epoch": 0.239125, + "grad_norm": 2.640625, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 7.4914, + "loss/crossentropy": 2.05319607257843, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21194881200790405, + "step": 3826 + }, + { + "epoch": 0.23925, + "grad_norm": 2.296875, + "grad_norm_var": 0.020882161458333333, + "learning_rate": 0.0001, + "loss": 7.4726, + "loss/crossentropy": 2.274933695793152, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.24475091695785522, + "step": 3828 + }, + { + "epoch": 0.239375, + "grad_norm": 2.625, + "grad_norm_var": 0.02076416015625, + "learning_rate": 0.0001, + "loss": 7.4758, + "loss/crossentropy": 2.2083067893981934, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21219877898693085, + "step": 3830 + }, + { + "epoch": 0.2395, + "grad_norm": 2.5, + "grad_norm_var": 0.0187164306640625, + "learning_rate": 0.0001, + "loss": 7.6724, + "loss/crossentropy": 2.3150436878204346, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2536996081471443, + "step": 3832 + }, + { + "epoch": 0.239625, + "grad_norm": 2.53125, + "grad_norm_var": 0.014557902018229167, + "learning_rate": 0.0001, + "loss": 7.569, + "loss/crossentropy": 2.2580004930496216, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21672644466161728, + "step": 3834 + }, + { + "epoch": 0.23975, + "grad_norm": 2.375, + "grad_norm_var": 0.014354451497395834, + "learning_rate": 0.0001, + "loss": 7.5683, + "loss/crossentropy": 2.080252170562744, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22742826491594315, + "step": 3836 + }, + { + "epoch": 0.239875, + "grad_norm": 2.375, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 7.3651, + "loss/crossentropy": 2.1281388998031616, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2401324361562729, + "step": 3838 + }, + { + "epoch": 0.24, + "grad_norm": 2.703125, + "grad_norm_var": 0.03736572265625, + "learning_rate": 0.0001, + "loss": 7.5007, + "loss/crossentropy": 2.2224199771881104, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23487379401922226, + "step": 3840 + }, + { + "epoch": 0.240125, + "grad_norm": 2.421875, + "grad_norm_var": 0.04025065104166667, + "learning_rate": 0.0001, + "loss": 7.632, + "loss/crossentropy": 2.424517512321472, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24695612490177155, + "step": 3842 + }, + { + "epoch": 0.24025, + "grad_norm": 2.75, + "grad_norm_var": 0.04254557291666667, + "learning_rate": 0.0001, + "loss": 7.5674, + "loss/crossentropy": 2.1599162220954895, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21321508288383484, + "step": 3844 + }, + { + "epoch": 0.240375, + "grad_norm": 2.59375, + "grad_norm_var": 0.0422271728515625, + "learning_rate": 0.0001, + "loss": 7.6312, + "loss/crossentropy": 2.0522512793540955, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.21054793149232864, + "step": 3846 + }, + { + "epoch": 0.2405, + "grad_norm": 2.46875, + "grad_norm_var": 0.04182840983072917, + "learning_rate": 0.0001, + "loss": 7.6148, + "loss/crossentropy": 2.3739218711853027, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.24357128143310547, + "step": 3848 + }, + { + "epoch": 0.240625, + "grad_norm": 2.125, + "grad_norm_var": 0.050309244791666666, + "learning_rate": 0.0001, + "loss": 7.4666, + "loss/crossentropy": 2.1554529666900635, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.21792703121900558, + "step": 3850 + }, + { + "epoch": 0.24075, + "grad_norm": 2.59375, + "grad_norm_var": 0.05252278645833333, + "learning_rate": 0.0001, + "loss": 7.5105, + "loss/crossentropy": 2.5243422985076904, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.23979288339614868, + "step": 3852 + }, + { + "epoch": 0.240875, + "grad_norm": 2.21875, + "grad_norm_var": 0.054032389322916666, + "learning_rate": 0.0001, + "loss": 7.6535, + "loss/crossentropy": 2.169221580028534, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21038363128900528, + "step": 3854 + }, + { + "epoch": 0.241, + "grad_norm": 2.34375, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 7.4283, + "loss/crossentropy": 2.375051498413086, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24313046038150787, + "step": 3856 + }, + { + "epoch": 0.241125, + "grad_norm": 2.453125, + "grad_norm_var": 0.027339680989583334, + "learning_rate": 0.0001, + "loss": 7.5168, + "loss/crossentropy": 2.504861831665039, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2226250097155571, + "step": 3858 + }, + { + "epoch": 0.24125, + "grad_norm": 2.171875, + "grad_norm_var": 0.022347005208333333, + "learning_rate": 0.0001, + "loss": 7.4035, + "loss/crossentropy": 2.188872456550598, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22465746104717255, + "step": 3860 + }, + { + "epoch": 0.241375, + "grad_norm": 2.34375, + "grad_norm_var": 0.018831380208333335, + "learning_rate": 0.0001, + "loss": 7.4832, + "loss/crossentropy": 2.109761595726013, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21876226365566254, + "step": 3862 + }, + { + "epoch": 0.2415, + "grad_norm": 2.34375, + "grad_norm_var": 0.0161041259765625, + "learning_rate": 0.0001, + "loss": 7.4889, + "loss/crossentropy": 2.454153060913086, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2234111726284027, + "step": 3864 + }, + { + "epoch": 0.241625, + "grad_norm": 2.359375, + "grad_norm_var": 0.0175445556640625, + "learning_rate": 0.0001, + "loss": 7.5401, + "loss/crossentropy": 2.2534934282302856, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.20882034301757812, + "step": 3866 + }, + { + "epoch": 0.24175, + "grad_norm": 2.234375, + "grad_norm_var": 0.01861572265625, + "learning_rate": 0.0001, + "loss": 7.3884, + "loss/crossentropy": 2.46646249294281, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23306798189878464, + "step": 3868 + }, + { + "epoch": 0.241875, + "grad_norm": 2.46875, + "grad_norm_var": 0.01939697265625, + "learning_rate": 0.0001, + "loss": 7.4742, + "loss/crossentropy": 2.286616086959839, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23071034252643585, + "step": 3870 + }, + { + "epoch": 0.242, + "grad_norm": 2.203125, + "grad_norm_var": 0.020002237955729165, + "learning_rate": 0.0001, + "loss": 7.4734, + "loss/crossentropy": 2.3127135038375854, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2306494191288948, + "step": 3872 + }, + { + "epoch": 0.242125, + "grad_norm": 2.34375, + "grad_norm_var": 0.018636067708333332, + "learning_rate": 0.0001, + "loss": 7.3801, + "loss/crossentropy": 2.292248845100403, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21103744953870773, + "step": 3874 + }, + { + "epoch": 0.24225, + "grad_norm": 2.53125, + "grad_norm_var": 0.020799763997395835, + "learning_rate": 0.0001, + "loss": 7.7475, + "loss/crossentropy": 2.2768986225128174, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.25374574959278107, + "step": 3876 + }, + { + "epoch": 0.242375, + "grad_norm": 2.640625, + "grad_norm_var": 0.02554931640625, + "learning_rate": 0.0001, + "loss": 7.4706, + "loss/crossentropy": 2.335216999053955, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2206215113401413, + "step": 3878 + }, + { + "epoch": 0.2425, + "grad_norm": 2.203125, + "grad_norm_var": 0.027098592122395834, + "learning_rate": 0.0001, + "loss": 7.6204, + "loss/crossentropy": 2.3161808252334595, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23353615403175354, + "step": 3880 + }, + { + "epoch": 0.242625, + "grad_norm": 2.453125, + "grad_norm_var": 0.024853515625, + "learning_rate": 0.0001, + "loss": 7.4924, + "loss/crossentropy": 2.1703152656555176, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23688945174217224, + "step": 3882 + }, + { + "epoch": 0.24275, + "grad_norm": 2.78125, + "grad_norm_var": 0.027176920572916666, + "learning_rate": 0.0001, + "loss": 7.529, + "loss/crossentropy": 2.342598557472229, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.21841312944889069, + "step": 3884 + }, + { + "epoch": 0.242875, + "grad_norm": 2.640625, + "grad_norm_var": 0.09791259765625, + "learning_rate": 0.0001, + "loss": 7.3372, + "loss/crossentropy": 2.1650543808937073, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.22078751772642136, + "step": 3886 + }, + { + "epoch": 0.243, + "grad_norm": 2.140625, + "grad_norm_var": 0.11746419270833333, + "learning_rate": 0.0001, + "loss": 7.4851, + "loss/crossentropy": 1.997887134552002, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21081900596618652, + "step": 3888 + }, + { + "epoch": 0.243125, + "grad_norm": 2.5, + "grad_norm_var": 0.11201070149739584, + "learning_rate": 0.0001, + "loss": 7.6291, + "loss/crossentropy": 2.2321996688842773, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22545063495635986, + "step": 3890 + }, + { + "epoch": 0.24325, + "grad_norm": 2.578125, + "grad_norm_var": 0.11261393229166666, + "learning_rate": 0.0001, + "loss": 7.5623, + "loss/crossentropy": 2.522809147834778, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24426687508821487, + "step": 3892 + }, + { + "epoch": 0.243375, + "grad_norm": 2.1875, + "grad_norm_var": 0.11669921875, + "learning_rate": 0.0001, + "loss": 7.4586, + "loss/crossentropy": 2.3665112257003784, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.253150999546051, + "step": 3894 + }, + { + "epoch": 0.2435, + "grad_norm": 2.78125, + "grad_norm_var": 0.11687825520833334, + "learning_rate": 0.0001, + "loss": 7.5531, + "loss/crossentropy": 2.4039018154144287, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22742793709039688, + "step": 3896 + }, + { + "epoch": 0.243625, + "grad_norm": 2.125, + "grad_norm_var": 0.1285552978515625, + "learning_rate": 0.0001, + "loss": 7.5957, + "loss/crossentropy": 2.3447670936584473, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2258382812142372, + "step": 3898 + }, + { + "epoch": 0.24375, + "grad_norm": 2.390625, + "grad_norm_var": 0.12775065104166666, + "learning_rate": 0.0001, + "loss": 7.3161, + "loss/crossentropy": 2.020743668079376, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2295529618859291, + "step": 3900 + }, + { + "epoch": 0.243875, + "grad_norm": 2.4375, + "grad_norm_var": 0.045685831705729166, + "learning_rate": 0.0001, + "loss": 7.4793, + "loss/crossentropy": 2.0840115547180176, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21463429182767868, + "step": 3902 + }, + { + "epoch": 0.244, + "grad_norm": 2.671875, + "grad_norm_var": 0.03689778645833333, + "learning_rate": 0.0001, + "loss": 7.7833, + "loss/crossentropy": 2.3722068071365356, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.22333616763353348, + "step": 3904 + }, + { + "epoch": 0.244125, + "grad_norm": 2.40625, + "grad_norm_var": 0.038605753580729166, + "learning_rate": 0.0001, + "loss": 7.4049, + "loss/crossentropy": 2.5307204723358154, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23095162957906723, + "step": 3906 + }, + { + "epoch": 0.24425, + "grad_norm": 2.34375, + "grad_norm_var": 0.036554972330729164, + "learning_rate": 0.0001, + "loss": 7.5943, + "loss/crossentropy": 2.427748680114746, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2420305386185646, + "step": 3908 + }, + { + "epoch": 0.244375, + "grad_norm": 2.375, + "grad_norm_var": 0.033984375, + "learning_rate": 0.0001, + "loss": 7.5523, + "loss/crossentropy": 2.3058314323425293, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.25578539073467255, + "step": 3910 + }, + { + "epoch": 0.2445, + "grad_norm": 2.28125, + "grad_norm_var": 0.0274566650390625, + "learning_rate": 0.0001, + "loss": 7.2921, + "loss/crossentropy": 1.9724953174591064, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.19885031133890152, + "step": 3912 + }, + { + "epoch": 0.244625, + "grad_norm": 2.390625, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 7.6238, + "loss/crossentropy": 2.1780654191970825, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2180589661002159, + "step": 3914 + }, + { + "epoch": 0.24475, + "grad_norm": 2.375, + "grad_norm_var": 0.017455037434895834, + "learning_rate": 0.0001, + "loss": 7.4598, + "loss/crossentropy": 2.246911883354187, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22257865220308304, + "step": 3916 + }, + { + "epoch": 0.244875, + "grad_norm": 2.484375, + "grad_norm_var": 0.01953125, + "learning_rate": 0.0001, + "loss": 7.5233, + "loss/crossentropy": 2.1807644367218018, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23529939353466034, + "step": 3918 + }, + { + "epoch": 0.245, + "grad_norm": 2.421875, + "grad_norm_var": 0.015241495768229167, + "learning_rate": 0.0001, + "loss": 7.6941, + "loss/crossentropy": 2.4168232679367065, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.24450047314167023, + "step": 3920 + }, + { + "epoch": 0.245125, + "grad_norm": 2.640625, + "grad_norm_var": 0.015836588541666665, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 2.1456319093704224, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22022899985313416, + "step": 3922 + }, + { + "epoch": 0.24525, + "grad_norm": 2.46875, + "grad_norm_var": 0.017183430989583335, + "learning_rate": 0.0001, + "loss": 7.5178, + "loss/crossentropy": 2.1945712566375732, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23692822456359863, + "step": 3924 + }, + { + "epoch": 0.245375, + "grad_norm": 2.25, + "grad_norm_var": 0.017943318684895834, + "learning_rate": 0.0001, + "loss": 7.5715, + "loss/crossentropy": 2.334246516227722, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.24413339793682098, + "step": 3926 + }, + { + "epoch": 0.2455, + "grad_norm": 2.40625, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 7.4425, + "loss/crossentropy": 2.052259385585785, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21555544435977936, + "step": 3928 + }, + { + "epoch": 0.245625, + "grad_norm": 2.46875, + "grad_norm_var": 0.0144195556640625, + "learning_rate": 0.0001, + "loss": 7.6389, + "loss/crossentropy": 2.4917489290237427, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24890758097171783, + "step": 3930 + }, + { + "epoch": 0.24575, + "grad_norm": 2.359375, + "grad_norm_var": 0.015250651041666667, + "learning_rate": 0.0001, + "loss": 7.5094, + "loss/crossentropy": 2.3055362701416016, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2225131392478943, + "step": 3932 + }, + { + "epoch": 0.245875, + "grad_norm": 2.40625, + "grad_norm_var": 0.013472493489583333, + "learning_rate": 0.0001, + "loss": 7.4905, + "loss/crossentropy": 2.1624478101730347, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2125004380941391, + "step": 3934 + }, + { + "epoch": 0.246, + "grad_norm": 2.328125, + "grad_norm_var": 0.013997395833333334, + "learning_rate": 0.0001, + "loss": 7.4723, + "loss/crossentropy": 2.408942222595215, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23912448436021805, + "step": 3936 + }, + { + "epoch": 0.246125, + "grad_norm": 2.234375, + "grad_norm_var": 0.012760416666666666, + "learning_rate": 0.0001, + "loss": 7.2563, + "loss/crossentropy": 2.1905765533447266, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22357414662837982, + "step": 3938 + }, + { + "epoch": 0.24625, + "grad_norm": 2.640625, + "grad_norm_var": 0.0129547119140625, + "learning_rate": 0.0001, + "loss": 7.4517, + "loss/crossentropy": 2.3154603242874146, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23004180938005447, + "step": 3940 + }, + { + "epoch": 0.246375, + "grad_norm": 2.3125, + "grad_norm_var": 0.011620076497395833, + "learning_rate": 0.0001, + "loss": 7.4689, + "loss/crossentropy": 2.5023341178894043, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24651438742876053, + "step": 3942 + }, + { + "epoch": 0.2465, + "grad_norm": 2.34375, + "grad_norm_var": 0.01148681640625, + "learning_rate": 0.0001, + "loss": 7.4688, + "loss/crossentropy": 2.0356882214546204, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.20058569312095642, + "step": 3944 + }, + { + "epoch": 0.246625, + "grad_norm": 2.5, + "grad_norm_var": 0.010791015625, + "learning_rate": 0.0001, + "loss": 7.5573, + "loss/crossentropy": 2.4191837310791016, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23377195000648499, + "step": 3946 + }, + { + "epoch": 0.24675, + "grad_norm": 2.5, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 7.6407, + "loss/crossentropy": 2.3417128324508667, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.22980307787656784, + "step": 3948 + }, + { + "epoch": 0.246875, + "grad_norm": 2.5, + "grad_norm_var": 0.014143880208333333, + "learning_rate": 0.0001, + "loss": 7.5853, + "loss/crossentropy": 2.3083345890045166, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23690129816532135, + "step": 3950 + }, + { + "epoch": 0.247, + "grad_norm": 2.34375, + "grad_norm_var": 0.017389933268229168, + "learning_rate": 0.0001, + "loss": 7.5956, + "loss/crossentropy": 2.456955909729004, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2354580983519554, + "step": 3952 + }, + { + "epoch": 0.247125, + "grad_norm": 2.359375, + "grad_norm_var": 0.015055338541666666, + "learning_rate": 0.0001, + "loss": 7.4467, + "loss/crossentropy": 2.162124276161194, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23025665432214737, + "step": 3954 + }, + { + "epoch": 0.24725, + "grad_norm": 2.265625, + "grad_norm_var": 0.013337198893229167, + "learning_rate": 0.0001, + "loss": 7.4083, + "loss/crossentropy": 2.203832507133484, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21881967037916183, + "step": 3956 + }, + { + "epoch": 0.247375, + "grad_norm": 2.453125, + "grad_norm_var": 0.012848917643229167, + "learning_rate": 0.0001, + "loss": 7.4759, + "loss/crossentropy": 2.226723313331604, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22026114910840988, + "step": 3958 + }, + { + "epoch": 0.2475, + "grad_norm": 2.421875, + "grad_norm_var": 0.012678019205729167, + "learning_rate": 0.0001, + "loss": 7.476, + "loss/crossentropy": 2.3727807998657227, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.247600257396698, + "step": 3960 + }, + { + "epoch": 0.247625, + "grad_norm": 2.375, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 7.483, + "loss/crossentropy": 2.2924267053604126, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2366592362523079, + "step": 3962 + }, + { + "epoch": 0.24775, + "grad_norm": 2.21875, + "grad_norm_var": 0.013654581705729167, + "learning_rate": 0.0001, + "loss": 7.3951, + "loss/crossentropy": 2.0975323915481567, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2243112102150917, + "step": 3964 + }, + { + "epoch": 0.247875, + "grad_norm": 2.53125, + "grad_norm_var": 0.010553995768229166, + "learning_rate": 0.0001, + "loss": 7.3489, + "loss/crossentropy": 2.21097195148468, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2190333753824234, + "step": 3966 + }, + { + "epoch": 0.248, + "grad_norm": 2.453125, + "grad_norm_var": 0.007840983072916667, + "learning_rate": 0.0001, + "loss": 7.4721, + "loss/crossentropy": 2.2507461309432983, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23200294375419617, + "step": 3968 + }, + { + "epoch": 0.248125, + "grad_norm": 2.234375, + "grad_norm_var": 0.0075592041015625, + "learning_rate": 0.0001, + "loss": 7.4843, + "loss/crossentropy": 2.191147208213806, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23013149946928024, + "step": 3970 + }, + { + "epoch": 0.24825, + "grad_norm": 2.453125, + "grad_norm_var": 0.00748291015625, + "learning_rate": 0.0001, + "loss": 7.5614, + "loss/crossentropy": 2.4555280208587646, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2434312105178833, + "step": 3972 + }, + { + "epoch": 0.248375, + "grad_norm": 2.375, + "grad_norm_var": 0.007157389322916667, + "learning_rate": 0.0001, + "loss": 7.4256, + "loss/crossentropy": 2.2261093854904175, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23195409029722214, + "step": 3974 + }, + { + "epoch": 0.2485, + "grad_norm": 2.59375, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 7.4072, + "loss/crossentropy": 2.1353044509887695, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21157176792621613, + "step": 3976 + }, + { + "epoch": 0.248625, + "grad_norm": 2.203125, + "grad_norm_var": 0.0131988525390625, + "learning_rate": 0.0001, + "loss": 7.3719, + "loss/crossentropy": 2.145276427268982, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21788546442985535, + "step": 3978 + }, + { + "epoch": 0.24875, + "grad_norm": 2.484375, + "grad_norm_var": 0.0127105712890625, + "learning_rate": 0.0001, + "loss": 7.4473, + "loss/crossentropy": 2.37498140335083, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21876709163188934, + "step": 3980 + }, + { + "epoch": 0.248875, + "grad_norm": 2.578125, + "grad_norm_var": 0.01724853515625, + "learning_rate": 0.0001, + "loss": 7.589, + "loss/crossentropy": 2.432854413986206, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2542262375354767, + "step": 3982 + }, + { + "epoch": 0.249, + "grad_norm": 2.21875, + "grad_norm_var": 0.0197418212890625, + "learning_rate": 0.0001, + "loss": 7.5591, + "loss/crossentropy": 2.451367735862732, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22828736156225204, + "step": 3984 + }, + { + "epoch": 0.249125, + "grad_norm": 2.375, + "grad_norm_var": 0.0188629150390625, + "learning_rate": 0.0001, + "loss": 7.4799, + "loss/crossentropy": 2.2237168550491333, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.23009125888347626, + "step": 3986 + }, + { + "epoch": 0.24925, + "grad_norm": 2.25, + "grad_norm_var": 0.020703125, + "learning_rate": 0.0001, + "loss": 7.6225, + "loss/crossentropy": 2.0535144805908203, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.22887953370809555, + "step": 3988 + }, + { + "epoch": 0.249375, + "grad_norm": 2.578125, + "grad_norm_var": 0.0243804931640625, + "learning_rate": 0.0001, + "loss": 7.2587, + "loss/crossentropy": 2.1713778972625732, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.20236970484256744, + "step": 3990 + }, + { + "epoch": 0.2495, + "grad_norm": 2.40625, + "grad_norm_var": 0.029539998372395834, + "learning_rate": 0.0001, + "loss": 7.5803, + "loss/crossentropy": 2.314449429512024, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2168670818209648, + "step": 3992 + }, + { + "epoch": 0.249625, + "grad_norm": 2.28125, + "grad_norm_var": 0.0293121337890625, + "learning_rate": 0.0001, + "loss": 7.5327, + "loss/crossentropy": 2.2450716495513916, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23649980127811432, + "step": 3994 + }, + { + "epoch": 0.24975, + "grad_norm": 2.28125, + "grad_norm_var": 0.029488118489583333, + "learning_rate": 0.0001, + "loss": 7.3866, + "loss/crossentropy": 2.172006130218506, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21691139042377472, + "step": 3996 + }, + { + "epoch": 0.249875, + "grad_norm": 2.46875, + "grad_norm_var": 0.025321451822916667, + "learning_rate": 0.0001, + "loss": 7.4429, + "loss/crossentropy": 2.230931878089905, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2391601949930191, + "step": 3998 + }, + { + "epoch": 0.25, + "grad_norm": 2.703125, + "grad_norm_var": 0.0368316650390625, + "learning_rate": 0.0001, + "loss": 7.4992, + "loss/crossentropy": 2.2851897478103638, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219674438238144, + "step": 4000 + }, + { + "epoch": 0.250125, + "grad_norm": 2.4375, + "grad_norm_var": 0.03642578125, + "learning_rate": 0.0001, + "loss": 7.6042, + "loss/crossentropy": 2.118333578109741, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.23840918391942978, + "step": 4002 + }, + { + "epoch": 0.25025, + "grad_norm": 2.375, + "grad_norm_var": 0.03258056640625, + "learning_rate": 0.0001, + "loss": 7.5683, + "loss/crossentropy": 2.031722903251648, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.21818936616182327, + "step": 4004 + }, + { + "epoch": 0.250375, + "grad_norm": 2.234375, + "grad_norm_var": 0.03168843587239583, + "learning_rate": 0.0001, + "loss": 7.4845, + "loss/crossentropy": 2.4262243509292603, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24300269782543182, + "step": 4006 + }, + { + "epoch": 0.2505, + "grad_norm": 2.4375, + "grad_norm_var": 0.024299112955729167, + "learning_rate": 0.0001, + "loss": 7.7033, + "loss/crossentropy": 2.167011022567749, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23807096481323242, + "step": 4008 + }, + { + "epoch": 0.250625, + "grad_norm": 2.421875, + "grad_norm_var": 0.02398681640625, + "learning_rate": 0.0001, + "loss": 7.532, + "loss/crossentropy": 2.434093475341797, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22854039818048477, + "step": 4010 + }, + { + "epoch": 0.25075, + "grad_norm": 2.109375, + "grad_norm_var": 0.030745442708333334, + "learning_rate": 0.0001, + "loss": 7.3823, + "loss/crossentropy": 2.121519684791565, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22766801714897156, + "step": 4012 + }, + { + "epoch": 0.250875, + "grad_norm": 2.34375, + "grad_norm_var": 0.027880859375, + "learning_rate": 0.0001, + "loss": 7.4289, + "loss/crossentropy": 2.2950530648231506, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24578213691711426, + "step": 4014 + }, + { + "epoch": 0.251, + "grad_norm": 2.671875, + "grad_norm_var": 0.020589192708333332, + "learning_rate": 0.0001, + "loss": 7.5219, + "loss/crossentropy": 2.3839573860168457, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21992743760347366, + "step": 4016 + }, + { + "epoch": 0.251125, + "grad_norm": 2.296875, + "grad_norm_var": 0.022321573893229165, + "learning_rate": 0.0001, + "loss": 7.5561, + "loss/crossentropy": 2.4837042093276978, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23451413959264755, + "step": 4018 + }, + { + "epoch": 0.25125, + "grad_norm": 2.4375, + "grad_norm_var": 0.0230133056640625, + "learning_rate": 0.0001, + "loss": 7.5998, + "loss/crossentropy": 2.4655433893203735, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2359592318534851, + "step": 4020 + }, + { + "epoch": 0.251375, + "grad_norm": 2.375, + "grad_norm_var": 0.0247711181640625, + "learning_rate": 0.0001, + "loss": 7.3372, + "loss/crossentropy": 2.121498227119446, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2136445865035057, + "step": 4022 + }, + { + "epoch": 0.2515, + "grad_norm": 2.859375, + "grad_norm_var": 0.039549763997395834, + "learning_rate": 0.0001, + "loss": 7.4977, + "loss/crossentropy": 2.208828091621399, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22217834740877151, + "step": 4024 + }, + { + "epoch": 0.251625, + "grad_norm": 2.265625, + "grad_norm_var": 0.06523030598958333, + "learning_rate": 0.0001, + "loss": 7.6357, + "loss/crossentropy": 2.191303014755249, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.25364626944065094, + "step": 4026 + }, + { + "epoch": 0.25175, + "grad_norm": 2.34375, + "grad_norm_var": 0.06326497395833333, + "learning_rate": 0.0001, + "loss": 7.4891, + "loss/crossentropy": 2.271737813949585, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23410511761903763, + "step": 4028 + }, + { + "epoch": 0.251875, + "grad_norm": 2.453125, + "grad_norm_var": 0.06123758951822917, + "learning_rate": 0.0001, + "loss": 7.5977, + "loss/crossentropy": 2.153814435005188, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21588297933340073, + "step": 4030 + }, + { + "epoch": 0.252, + "grad_norm": 2.421875, + "grad_norm_var": 0.0592437744140625, + "learning_rate": 0.0001, + "loss": 7.2691, + "loss/crossentropy": 2.183136820793152, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22347302734851837, + "step": 4032 + }, + { + "epoch": 0.252125, + "grad_norm": 2.25, + "grad_norm_var": 0.060205078125, + "learning_rate": 0.0001, + "loss": 7.481, + "loss/crossentropy": 2.1289315223693848, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2259645164012909, + "step": 4034 + }, + { + "epoch": 0.25225, + "grad_norm": 2.4375, + "grad_norm_var": 0.0599517822265625, + "learning_rate": 0.0001, + "loss": 7.5806, + "loss/crossentropy": 1.9534605145454407, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21411054581403732, + "step": 4036 + }, + { + "epoch": 0.252375, + "grad_norm": 2.375, + "grad_norm_var": 0.06717122395833333, + "learning_rate": 0.0001, + "loss": 7.6651, + "loss/crossentropy": 2.3216229677200317, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.21676085889339447, + "step": 4038 + }, + { + "epoch": 0.2525, + "grad_norm": 2.296875, + "grad_norm_var": 0.058470662434895834, + "learning_rate": 0.0001, + "loss": 7.5468, + "loss/crossentropy": 2.086758255958557, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20101846754550934, + "step": 4040 + }, + { + "epoch": 0.252625, + "grad_norm": 2.28125, + "grad_norm_var": 0.03208719889322917, + "learning_rate": 0.0001, + "loss": 7.3466, + "loss/crossentropy": 2.2384684085845947, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2282901331782341, + "step": 4042 + }, + { + "epoch": 0.25275, + "grad_norm": 2.53125, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 7.4619, + "loss/crossentropy": 1.9721214771270752, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21432989835739136, + "step": 4044 + }, + { + "epoch": 0.252875, + "grad_norm": 2.34375, + "grad_norm_var": 0.029313151041666666, + "learning_rate": 0.0001, + "loss": 7.3318, + "loss/crossentropy": 1.9340474605560303, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21380113065242767, + "step": 4046 + }, + { + "epoch": 0.253, + "grad_norm": 2.21875, + "grad_norm_var": 0.0304351806640625, + "learning_rate": 0.0001, + "loss": 7.5475, + "loss/crossentropy": 2.3662819862365723, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21932249516248703, + "step": 4048 + }, + { + "epoch": 0.253125, + "grad_norm": 2.515625, + "grad_norm_var": 0.030940755208333334, + "learning_rate": 0.0001, + "loss": 7.5843, + "loss/crossentropy": 2.2904332876205444, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326778918504715, + "step": 4050 + }, + { + "epoch": 0.25325, + "grad_norm": 2.234375, + "grad_norm_var": 0.032421875, + "learning_rate": 0.0001, + "loss": 7.4797, + "loss/crossentropy": 2.4156243801116943, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23186896741390228, + "step": 4052 + }, + { + "epoch": 0.253375, + "grad_norm": 2.40625, + "grad_norm_var": 0.018408203125, + "learning_rate": 0.0001, + "loss": 7.4728, + "loss/crossentropy": 2.2443253993988037, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21730951219797134, + "step": 4054 + }, + { + "epoch": 0.2535, + "grad_norm": 2.359375, + "grad_norm_var": 0.019359334309895834, + "learning_rate": 0.0001, + "loss": 7.4368, + "loss/crossentropy": 1.9812930226325989, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2251596599817276, + "step": 4056 + }, + { + "epoch": 0.253625, + "grad_norm": 2.171875, + "grad_norm_var": 0.020441691080729168, + "learning_rate": 0.0001, + "loss": 7.4598, + "loss/crossentropy": 2.628469467163086, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23521346598863602, + "step": 4058 + }, + { + "epoch": 0.25375, + "grad_norm": 2.4375, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 7.5237, + "loss/crossentropy": 2.390172600746155, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22036952525377274, + "step": 4060 + }, + { + "epoch": 0.253875, + "grad_norm": 2.46875, + "grad_norm_var": 0.019303385416666666, + "learning_rate": 0.0001, + "loss": 7.632, + "loss/crossentropy": 2.174973249435425, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22056061774492264, + "step": 4062 + }, + { + "epoch": 0.254, + "grad_norm": 2.390625, + "grad_norm_var": 0.020601399739583335, + "learning_rate": 0.0001, + "loss": 7.5629, + "loss/crossentropy": 2.2380123138427734, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2207566350698471, + "step": 4064 + }, + { + "epoch": 0.254125, + "grad_norm": 2.359375, + "grad_norm_var": 0.017821248372395834, + "learning_rate": 0.0001, + "loss": 7.4799, + "loss/crossentropy": 2.308589458465576, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22748146951198578, + "step": 4066 + }, + { + "epoch": 0.25425, + "grad_norm": 2.578125, + "grad_norm_var": 0.0190826416015625, + "learning_rate": 0.0001, + "loss": 7.6862, + "loss/crossentropy": 2.6882801055908203, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2547220140695572, + "step": 4068 + }, + { + "epoch": 0.254375, + "grad_norm": 2.15625, + "grad_norm_var": 0.0201324462890625, + "learning_rate": 0.0001, + "loss": 7.5749, + "loss/crossentropy": 2.417618155479431, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23678294569253922, + "step": 4070 + }, + { + "epoch": 0.2545, + "grad_norm": 2.609375, + "grad_norm_var": 0.0204986572265625, + "learning_rate": 0.0001, + "loss": 7.356, + "loss/crossentropy": 2.23097562789917, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2246166616678238, + "step": 4072 + }, + { + "epoch": 0.254625, + "grad_norm": 2.359375, + "grad_norm_var": 0.02060546875, + "learning_rate": 0.0001, + "loss": 7.6507, + "loss/crossentropy": 2.252380132675171, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23461927473545074, + "step": 4074 + }, + { + "epoch": 0.25475, + "grad_norm": 2.453125, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 7.5349, + "loss/crossentropy": 2.2581721544265747, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2264912948012352, + "step": 4076 + }, + { + "epoch": 0.254875, + "grad_norm": 2.6875, + "grad_norm_var": 0.022347005208333333, + "learning_rate": 0.0001, + "loss": 7.4877, + "loss/crossentropy": 2.3790767192840576, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.23550712317228317, + "step": 4078 + }, + { + "epoch": 0.255, + "grad_norm": 2.34375, + "grad_norm_var": 0.022248331705729166, + "learning_rate": 0.0001, + "loss": 7.4732, + "loss/crossentropy": 2.3831344842910767, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23731465637683868, + "step": 4080 + }, + { + "epoch": 0.255125, + "grad_norm": 2.359375, + "grad_norm_var": 0.026178995768229168, + "learning_rate": 0.0001, + "loss": 7.4923, + "loss/crossentropy": 2.5071096420288086, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22199269384145737, + "step": 4082 + }, + { + "epoch": 0.25525, + "grad_norm": 2.46875, + "grad_norm_var": 0.023758951822916666, + "learning_rate": 0.0001, + "loss": 7.5247, + "loss/crossentropy": 2.30050528049469, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22400212287902832, + "step": 4084 + }, + { + "epoch": 0.255375, + "grad_norm": 2.203125, + "grad_norm_var": 0.0220123291015625, + "learning_rate": 0.0001, + "loss": 7.4624, + "loss/crossentropy": 2.2171316146850586, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21539007127285004, + "step": 4086 + }, + { + "epoch": 0.2555, + "grad_norm": 2.34375, + "grad_norm_var": 0.023493448893229168, + "learning_rate": 0.0001, + "loss": 7.5157, + "loss/crossentropy": 2.311915159225464, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2283788025379181, + "step": 4088 + }, + { + "epoch": 0.255625, + "grad_norm": 2.265625, + "grad_norm_var": 0.017997233072916667, + "learning_rate": 0.0001, + "loss": 7.3652, + "loss/crossentropy": 2.3196401596069336, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.22029782831668854, + "step": 4090 + }, + { + "epoch": 0.25575, + "grad_norm": 2.265625, + "grad_norm_var": 0.01959228515625, + "learning_rate": 0.0001, + "loss": 7.3629, + "loss/crossentropy": 2.2703882455825806, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21397818624973297, + "step": 4092 + }, + { + "epoch": 0.255875, + "grad_norm": 2.390625, + "grad_norm_var": 0.0088531494140625, + "learning_rate": 0.0001, + "loss": 7.4362, + "loss/crossentropy": 2.296531915664673, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21631457656621933, + "step": 4094 + }, + { + "epoch": 0.256, + "grad_norm": 2.1875, + "grad_norm_var": 0.010481770833333333, + "learning_rate": 0.0001, + "loss": 7.5304, + "loss/crossentropy": 2.178624987602234, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22841021418571472, + "step": 4096 + }, + { + "epoch": 0.256125, + "grad_norm": 2.359375, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 7.4817, + "loss/crossentropy": 2.3095529079437256, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22539246827363968, + "step": 4098 + }, + { + "epoch": 0.25625, + "grad_norm": 2.1875, + "grad_norm_var": 0.009618123372395834, + "learning_rate": 0.0001, + "loss": 7.4224, + "loss/crossentropy": 2.3652232885360718, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.24415633082389832, + "step": 4100 + }, + { + "epoch": 0.256375, + "grad_norm": 2.546875, + "grad_norm_var": 0.011083984375, + "learning_rate": 0.0001, + "loss": 7.4878, + "loss/crossentropy": 2.0212921500205994, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22881153225898743, + "step": 4102 + }, + { + "epoch": 0.2565, + "grad_norm": 2.484375, + "grad_norm_var": 0.01650390625, + "learning_rate": 0.0001, + "loss": 7.4785, + "loss/crossentropy": 2.3224822282791138, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23468727618455887, + "step": 4104 + }, + { + "epoch": 0.256625, + "grad_norm": 2.609375, + "grad_norm_var": 0.019364420572916666, + "learning_rate": 0.0001, + "loss": 7.7486, + "loss/crossentropy": 2.4106584787368774, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21804316341876984, + "step": 4106 + }, + { + "epoch": 0.25675, + "grad_norm": 2.1875, + "grad_norm_var": 0.0199371337890625, + "learning_rate": 0.0001, + "loss": 7.321, + "loss/crossentropy": 2.2287791967391968, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22060447931289673, + "step": 4108 + }, + { + "epoch": 0.256875, + "grad_norm": 2.34375, + "grad_norm_var": 0.0247955322265625, + "learning_rate": 0.0001, + "loss": 7.4268, + "loss/crossentropy": 1.9771644473075867, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20855499804019928, + "step": 4110 + }, + { + "epoch": 0.257, + "grad_norm": 2.46875, + "grad_norm_var": 0.0250396728515625, + "learning_rate": 0.0001, + "loss": 7.4497, + "loss/crossentropy": 2.125575006008148, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2247249186038971, + "step": 4112 + }, + { + "epoch": 0.257125, + "grad_norm": 2.640625, + "grad_norm_var": 0.028153483072916666, + "learning_rate": 0.0001, + "loss": 7.4864, + "loss/crossentropy": 2.074004888534546, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21001387387514114, + "step": 4114 + }, + { + "epoch": 0.25725, + "grad_norm": 2.21875, + "grad_norm_var": 0.027367146809895833, + "learning_rate": 0.0001, + "loss": 7.4488, + "loss/crossentropy": 1.9355474710464478, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20440368354320526, + "step": 4116 + }, + { + "epoch": 0.257375, + "grad_norm": 2.359375, + "grad_norm_var": 0.025386555989583334, + "learning_rate": 0.0001, + "loss": 7.4847, + "loss/crossentropy": 2.311020255088806, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.20823492854833603, + "step": 4118 + }, + { + "epoch": 0.2575, + "grad_norm": 2.234375, + "grad_norm_var": 0.023363240559895835, + "learning_rate": 0.0001, + "loss": 7.4348, + "loss/crossentropy": 2.3363587856292725, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22158727049827576, + "step": 4120 + }, + { + "epoch": 0.257625, + "grad_norm": 3.09375, + "grad_norm_var": 1.232933553059896, + "learning_rate": 0.0001, + "loss": 7.5643, + "loss/crossentropy": 2.253212571144104, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2316179946064949, + "step": 4122 + }, + { + "epoch": 0.25775, + "grad_norm": 2.328125, + "grad_norm_var": 1.2166900634765625, + "learning_rate": 0.0001, + "loss": 7.5682, + "loss/crossentropy": 2.22301983833313, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23405539989471436, + "step": 4124 + }, + { + "epoch": 0.257875, + "grad_norm": 2.625, + "grad_norm_var": 1.179638671875, + "learning_rate": 0.0001, + "loss": 7.6354, + "loss/crossentropy": 2.1657320261001587, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.24702759087085724, + "step": 4126 + }, + { + "epoch": 0.258, + "grad_norm": 2.265625, + "grad_norm_var": 1.1891886393229167, + "learning_rate": 0.0001, + "loss": 7.6633, + "loss/crossentropy": 2.708492875099182, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.248228058218956, + "step": 4128 + }, + { + "epoch": 0.258125, + "grad_norm": 2.4375, + "grad_norm_var": 1.183177693684896, + "learning_rate": 0.0001, + "loss": 7.567, + "loss/crossentropy": 2.481553316116333, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.24093511700630188, + "step": 4130 + }, + { + "epoch": 0.25825, + "grad_norm": 2.25, + "grad_norm_var": 1.1793609619140626, + "learning_rate": 0.0001, + "loss": 7.5228, + "loss/crossentropy": 2.336062788963318, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22933385521173477, + "step": 4132 + }, + { + "epoch": 0.258375, + "grad_norm": 2.59375, + "grad_norm_var": 1.1680338541666666, + "learning_rate": 0.0001, + "loss": 7.4909, + "loss/crossentropy": 2.233590006828308, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22607532143592834, + "step": 4134 + }, + { + "epoch": 0.2585, + "grad_norm": 2.5, + "grad_norm_var": 1.15947265625, + "learning_rate": 0.0001, + "loss": 7.6554, + "loss/crossentropy": 2.3016180992126465, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21121040731668472, + "step": 4136 + }, + { + "epoch": 0.258625, + "grad_norm": 2.40625, + "grad_norm_var": 0.026317342122395834, + "learning_rate": 0.0001, + "loss": 7.551, + "loss/crossentropy": 2.0917162895202637, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2196909263730049, + "step": 4138 + }, + { + "epoch": 0.25875, + "grad_norm": 2.265625, + "grad_norm_var": 0.025862630208333334, + "learning_rate": 0.0001, + "loss": 7.5891, + "loss/crossentropy": 2.369123935699463, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2268521413207054, + "step": 4140 + }, + { + "epoch": 0.258875, + "grad_norm": 2.546875, + "grad_norm_var": 0.016141764322916665, + "learning_rate": 0.0001, + "loss": 7.4561, + "loss/crossentropy": 2.4183573722839355, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.24558168649673462, + "step": 4142 + }, + { + "epoch": 0.259, + "grad_norm": 2.359375, + "grad_norm_var": 0.012906901041666667, + "learning_rate": 0.0001, + "loss": 7.5225, + "loss/crossentropy": 2.1985327005386353, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2282828763127327, + "step": 4144 + }, + { + "epoch": 0.259125, + "grad_norm": 2.28125, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 7.4406, + "loss/crossentropy": 2.2581781148910522, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2538040652871132, + "step": 4146 + }, + { + "epoch": 0.25925, + "grad_norm": 2.703125, + "grad_norm_var": 0.01461181640625, + "learning_rate": 0.0001, + "loss": 7.562, + "loss/crossentropy": 2.539314389228821, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23499274253845215, + "step": 4148 + }, + { + "epoch": 0.259375, + "grad_norm": 2.28125, + "grad_norm_var": 0.013948567708333333, + "learning_rate": 0.0001, + "loss": 7.5071, + "loss/crossentropy": 2.0835620164871216, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22590668499469757, + "step": 4150 + }, + { + "epoch": 0.2595, + "grad_norm": 2.375, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 7.4878, + "loss/crossentropy": 2.1896191835403442, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22646436095237732, + "step": 4152 + }, + { + "epoch": 0.259625, + "grad_norm": 2.40625, + "grad_norm_var": 0.01422119140625, + "learning_rate": 0.0001, + "loss": 7.3402, + "loss/crossentropy": 1.9305825233459473, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.20503179728984833, + "step": 4154 + }, + { + "epoch": 0.25975, + "grad_norm": 2.171875, + "grad_norm_var": 0.0172515869140625, + "learning_rate": 0.0001, + "loss": 7.417, + "loss/crossentropy": 1.887523353099823, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2075207680463791, + "step": 4156 + }, + { + "epoch": 0.259875, + "grad_norm": 2.5625, + "grad_norm_var": 0.017756144205729168, + "learning_rate": 0.0001, + "loss": 7.5039, + "loss/crossentropy": 2.354191780090332, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23857767134904861, + "step": 4158 + }, + { + "epoch": 0.26, + "grad_norm": 2.21875, + "grad_norm_var": 0.019873046875, + "learning_rate": 0.0001, + "loss": 7.3457, + "loss/crossentropy": 2.1631126403808594, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.208695188164711, + "step": 4160 + }, + { + "epoch": 0.260125, + "grad_norm": 2.3125, + "grad_norm_var": 0.0183990478515625, + "learning_rate": 0.0001, + "loss": 7.337, + "loss/crossentropy": 2.1133294701576233, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21781667321920395, + "step": 4162 + }, + { + "epoch": 0.26025, + "grad_norm": 2.109375, + "grad_norm_var": 0.014891560872395833, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.209348678588867, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21353087574243546, + "step": 4164 + }, + { + "epoch": 0.260375, + "grad_norm": 2.5625, + "grad_norm_var": 0.018000284830729168, + "learning_rate": 0.0001, + "loss": 7.5817, + "loss/crossentropy": 2.3300869464874268, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23223827034235, + "step": 4166 + }, + { + "epoch": 0.2605, + "grad_norm": 2.265625, + "grad_norm_var": 0.017829386393229167, + "learning_rate": 0.0001, + "loss": 7.4924, + "loss/crossentropy": 2.2505780458450317, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21991939842700958, + "step": 4168 + }, + { + "epoch": 0.260625, + "grad_norm": 2.265625, + "grad_norm_var": 0.017936197916666667, + "learning_rate": 0.0001, + "loss": 7.4621, + "loss/crossentropy": 2.1267285346984863, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.20734632015228271, + "step": 4170 + }, + { + "epoch": 0.26075, + "grad_norm": 2.859375, + "grad_norm_var": 0.03319905598958333, + "learning_rate": 0.0001, + "loss": 7.6821, + "loss/crossentropy": 2.561190962791443, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.25469981133937836, + "step": 4172 + }, + { + "epoch": 0.260875, + "grad_norm": 2.296875, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 0.0001, + "loss": 7.6492, + "loss/crossentropy": 2.5840975046157837, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24661777913570404, + "step": 4174 + }, + { + "epoch": 0.261, + "grad_norm": 2.5625, + "grad_norm_var": 0.032027180989583334, + "learning_rate": 0.0001, + "loss": 7.3485, + "loss/crossentropy": 2.312406063079834, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23344486206769943, + "step": 4176 + }, + { + "epoch": 0.261125, + "grad_norm": 2.203125, + "grad_norm_var": 0.033219401041666666, + "learning_rate": 0.0001, + "loss": 7.4603, + "loss/crossentropy": 2.3917373418807983, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22443928569555283, + "step": 4178 + }, + { + "epoch": 0.26125, + "grad_norm": 2.375, + "grad_norm_var": 0.03388570149739583, + "learning_rate": 0.0001, + "loss": 7.5113, + "loss/crossentropy": 2.3461934328079224, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2298140972852707, + "step": 4180 + }, + { + "epoch": 0.261375, + "grad_norm": 2.390625, + "grad_norm_var": 0.0312408447265625, + "learning_rate": 0.0001, + "loss": 7.5746, + "loss/crossentropy": 2.3137890100479126, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23146747052669525, + "step": 4182 + }, + { + "epoch": 0.2615, + "grad_norm": 2.53125, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 7.5043, + "loss/crossentropy": 2.2696300745010376, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2407674714922905, + "step": 4184 + }, + { + "epoch": 0.261625, + "grad_norm": 2.234375, + "grad_norm_var": 0.03245340983072917, + "learning_rate": 0.0001, + "loss": 7.4266, + "loss/crossentropy": 2.3981817960739136, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2357579469680786, + "step": 4186 + }, + { + "epoch": 0.26175, + "grad_norm": 2.28125, + "grad_norm_var": 0.022826131184895834, + "learning_rate": 0.0001, + "loss": 7.2983, + "loss/crossentropy": 2.278700351715088, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.24243299663066864, + "step": 4188 + }, + { + "epoch": 0.261875, + "grad_norm": 2.65625, + "grad_norm_var": 0.0290191650390625, + "learning_rate": 0.0001, + "loss": 7.4591, + "loss/crossentropy": 2.4734745025634766, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2231806293129921, + "step": 4190 + }, + { + "epoch": 0.262, + "grad_norm": 2.5625, + "grad_norm_var": 0.03518473307291667, + "learning_rate": 0.0001, + "loss": 7.535, + "loss/crossentropy": 2.281826972961426, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2438691332936287, + "step": 4192 + }, + { + "epoch": 0.262125, + "grad_norm": 2.140625, + "grad_norm_var": 0.03855692545572917, + "learning_rate": 0.0001, + "loss": 7.4283, + "loss/crossentropy": 2.1762090921401978, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.19439425319433212, + "step": 4194 + }, + { + "epoch": 0.26225, + "grad_norm": 2.65625, + "grad_norm_var": 0.03613993326822917, + "learning_rate": 0.0001, + "loss": 7.5443, + "loss/crossentropy": 2.3899075984954834, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22394779324531555, + "step": 4196 + }, + { + "epoch": 0.262375, + "grad_norm": 2.21875, + "grad_norm_var": 0.03876953125, + "learning_rate": 0.0001, + "loss": 7.612, + "loss/crossentropy": 2.250289797782898, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22523928433656693, + "step": 4198 + }, + { + "epoch": 0.2625, + "grad_norm": 2.515625, + "grad_norm_var": 0.03834228515625, + "learning_rate": 0.0001, + "loss": 7.4899, + "loss/crossentropy": 2.0721304416656494, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.22663410007953644, + "step": 4200 + }, + { + "epoch": 0.262625, + "grad_norm": 2.421875, + "grad_norm_var": 0.0362945556640625, + "learning_rate": 0.0001, + "loss": 7.5435, + "loss/crossentropy": 2.1173004508018494, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20741065591573715, + "step": 4202 + }, + { + "epoch": 0.26275, + "grad_norm": 2.171875, + "grad_norm_var": 0.033186848958333334, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.3617148399353027, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22693531960248947, + "step": 4204 + }, + { + "epoch": 0.262875, + "grad_norm": 2.390625, + "grad_norm_var": 0.029784138997395834, + "learning_rate": 0.0001, + "loss": 7.49, + "loss/crossentropy": 2.1566708087921143, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22167697548866272, + "step": 4206 + }, + { + "epoch": 0.263, + "grad_norm": 2.5, + "grad_norm_var": 0.022001139322916665, + "learning_rate": 0.0001, + "loss": 7.4201, + "loss/crossentropy": 2.2872395515441895, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23204121738672256, + "step": 4208 + }, + { + "epoch": 0.263125, + "grad_norm": 2.265625, + "grad_norm_var": 0.0179595947265625, + "learning_rate": 0.0001, + "loss": 7.4017, + "loss/crossentropy": 2.0254051089286804, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21901094913482666, + "step": 4210 + }, + { + "epoch": 0.26325, + "grad_norm": 2.109375, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 7.3872, + "loss/crossentropy": 2.1917017698287964, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2296266034245491, + "step": 4212 + }, + { + "epoch": 0.263375, + "grad_norm": 2.5, + "grad_norm_var": 0.019050089518229167, + "learning_rate": 0.0001, + "loss": 7.5185, + "loss/crossentropy": 2.329349994659424, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22277437150478363, + "step": 4214 + }, + { + "epoch": 0.2635, + "grad_norm": 2.21875, + "grad_norm_var": 0.019661458333333333, + "learning_rate": 0.0001, + "loss": 7.4769, + "loss/crossentropy": 2.3094968795776367, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22067157924175262, + "step": 4216 + }, + { + "epoch": 0.263625, + "grad_norm": 2.390625, + "grad_norm_var": 0.015999348958333333, + "learning_rate": 0.0001, + "loss": 7.5023, + "loss/crossentropy": 2.057901620864868, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.21499791741371155, + "step": 4218 + }, + { + "epoch": 0.26375, + "grad_norm": 2.203125, + "grad_norm_var": 0.015283203125, + "learning_rate": 0.0001, + "loss": 7.3844, + "loss/crossentropy": 2.2509127855300903, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21002254635095596, + "step": 4220 + }, + { + "epoch": 0.263875, + "grad_norm": 2.453125, + "grad_norm_var": 0.0160064697265625, + "learning_rate": 0.0001, + "loss": 7.3918, + "loss/crossentropy": 2.280747413635254, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21898390352725983, + "step": 4222 + }, + { + "epoch": 0.264, + "grad_norm": 2.25, + "grad_norm_var": 0.015672810872395835, + "learning_rate": 0.0001, + "loss": 7.5155, + "loss/crossentropy": 2.3069392442703247, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23423085361719131, + "step": 4224 + }, + { + "epoch": 0.264125, + "grad_norm": 2.3125, + "grad_norm_var": 0.015185546875, + "learning_rate": 0.0001, + "loss": 7.4396, + "loss/crossentropy": 2.0961318016052246, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23950795829296112, + "step": 4226 + }, + { + "epoch": 0.26425, + "grad_norm": 2.265625, + "grad_norm_var": 0.012691243489583334, + "learning_rate": 0.0001, + "loss": 7.4114, + "loss/crossentropy": 2.2781461477279663, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23504098504781723, + "step": 4228 + }, + { + "epoch": 0.264375, + "grad_norm": 2.296875, + "grad_norm_var": 0.011181640625, + "learning_rate": 0.0001, + "loss": 7.4173, + "loss/crossentropy": 2.1842299699783325, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21509061753749847, + "step": 4230 + }, + { + "epoch": 0.2645, + "grad_norm": 2.328125, + "grad_norm_var": 0.008817545572916667, + "learning_rate": 0.0001, + "loss": 7.5102, + "loss/crossentropy": 2.3369181156158447, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23115838319063187, + "step": 4232 + }, + { + "epoch": 0.264625, + "grad_norm": 2.46875, + "grad_norm_var": 0.018033854166666665, + "learning_rate": 0.0001, + "loss": 7.6056, + "loss/crossentropy": 2.123593807220459, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21955028176307678, + "step": 4234 + }, + { + "epoch": 0.26475, + "grad_norm": 2.421875, + "grad_norm_var": 0.019579060872395835, + "learning_rate": 0.0001, + "loss": 7.5573, + "loss/crossentropy": 2.1841901540756226, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20950836688280106, + "step": 4236 + }, + { + "epoch": 0.264875, + "grad_norm": 2.1875, + "grad_norm_var": 0.0201812744140625, + "learning_rate": 0.0001, + "loss": 7.4577, + "loss/crossentropy": 2.180688500404358, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21572377532720566, + "step": 4238 + }, + { + "epoch": 0.265, + "grad_norm": 2.296875, + "grad_norm_var": 0.0191314697265625, + "learning_rate": 0.0001, + "loss": 7.5163, + "loss/crossentropy": 2.5182595252990723, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.24843312799930573, + "step": 4240 + }, + { + "epoch": 0.265125, + "grad_norm": 2.4375, + "grad_norm_var": 0.020783487955729166, + "learning_rate": 0.0001, + "loss": 7.3331, + "loss/crossentropy": 2.3631211519241333, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22477340698242188, + "step": 4242 + }, + { + "epoch": 0.26525, + "grad_norm": 2.109375, + "grad_norm_var": 0.025874837239583334, + "learning_rate": 0.0001, + "loss": 7.3582, + "loss/crossentropy": 2.392832636833191, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23732294142246246, + "step": 4244 + }, + { + "epoch": 0.265375, + "grad_norm": 2.640625, + "grad_norm_var": 0.0303863525390625, + "learning_rate": 0.0001, + "loss": 7.5458, + "loss/crossentropy": 2.4172616004943848, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.25564996898174286, + "step": 4246 + }, + { + "epoch": 0.2655, + "grad_norm": 2.25, + "grad_norm_var": 0.030614217122395832, + "learning_rate": 0.0001, + "loss": 7.3352, + "loss/crossentropy": 2.068120002746582, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20718112587928772, + "step": 4248 + }, + { + "epoch": 0.265625, + "grad_norm": 2.203125, + "grad_norm_var": 0.025495402018229165, + "learning_rate": 0.0001, + "loss": 7.4747, + "loss/crossentropy": 2.3594859838485718, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22751520574092865, + "step": 4250 + }, + { + "epoch": 0.26575, + "grad_norm": 2.421875, + "grad_norm_var": 0.023298136393229165, + "learning_rate": 0.0001, + "loss": 7.3998, + "loss/crossentropy": 2.446703553199768, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23083661496639252, + "step": 4252 + }, + { + "epoch": 0.265875, + "grad_norm": 2.203125, + "grad_norm_var": 0.026398722330729166, + "learning_rate": 0.0001, + "loss": 7.3885, + "loss/crossentropy": 2.268368363380432, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.222725510597229, + "step": 4254 + }, + { + "epoch": 0.266, + "grad_norm": 2.1875, + "grad_norm_var": 0.028446451822916666, + "learning_rate": 0.0001, + "loss": 7.5755, + "loss/crossentropy": 2.152546525001526, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22324977815151215, + "step": 4256 + }, + { + "epoch": 0.266125, + "grad_norm": 2.3125, + "grad_norm_var": 0.026220703125, + "learning_rate": 0.0001, + "loss": 7.3993, + "loss/crossentropy": 2.101604700088501, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22246717661619186, + "step": 4258 + }, + { + "epoch": 0.26625, + "grad_norm": 2.703125, + "grad_norm_var": 0.029410807291666667, + "learning_rate": 0.0001, + "loss": 7.4041, + "loss/crossentropy": 2.2383921146392822, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22142166644334793, + "step": 4260 + }, + { + "epoch": 0.266375, + "grad_norm": 2.265625, + "grad_norm_var": 0.023681640625, + "learning_rate": 0.0001, + "loss": 7.5145, + "loss/crossentropy": 2.367012858390808, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22680655121803284, + "step": 4262 + }, + { + "epoch": 0.2665, + "grad_norm": 2.203125, + "grad_norm_var": 0.025732421875, + "learning_rate": 0.0001, + "loss": 7.2148, + "loss/crossentropy": 2.4683737754821777, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23372989892959595, + "step": 4264 + }, + { + "epoch": 0.266625, + "grad_norm": 2.5, + "grad_norm_var": 0.0251617431640625, + "learning_rate": 0.0001, + "loss": 7.4554, + "loss/crossentropy": 2.2976629734039307, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21854644268751144, + "step": 4266 + }, + { + "epoch": 0.26675, + "grad_norm": 2.3125, + "grad_norm_var": 0.025874837239583334, + "learning_rate": 0.0001, + "loss": 7.567, + "loss/crossentropy": 2.0734461545944214, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.20711353421211243, + "step": 4268 + }, + { + "epoch": 0.266875, + "grad_norm": 2.25, + "grad_norm_var": 0.023094685872395833, + "learning_rate": 0.0001, + "loss": 7.3456, + "loss/crossentropy": 2.2194411754608154, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2097087875008583, + "step": 4270 + }, + { + "epoch": 0.267, + "grad_norm": 2.375, + "grad_norm_var": 0.0214263916015625, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.3072394132614136, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2229909971356392, + "step": 4272 + }, + { + "epoch": 0.267125, + "grad_norm": 2.546875, + "grad_norm_var": 0.0269195556640625, + "learning_rate": 0.0001, + "loss": 7.4624, + "loss/crossentropy": 2.3109829425811768, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2220280021429062, + "step": 4274 + }, + { + "epoch": 0.26725, + "grad_norm": 2.4375, + "grad_norm_var": 0.020921834309895835, + "learning_rate": 0.0001, + "loss": 7.5419, + "loss/crossentropy": 2.2977495193481445, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.24211393296718597, + "step": 4276 + }, + { + "epoch": 0.267375, + "grad_norm": 2.203125, + "grad_norm_var": 0.021605428059895834, + "learning_rate": 0.0001, + "loss": 7.5332, + "loss/crossentropy": 2.2791624069213867, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2135421559214592, + "step": 4278 + }, + { + "epoch": 0.2675, + "grad_norm": 2.5, + "grad_norm_var": 0.019489542643229166, + "learning_rate": 0.0001, + "loss": 7.5424, + "loss/crossentropy": 2.4423515796661377, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2313070297241211, + "step": 4280 + }, + { + "epoch": 0.267625, + "grad_norm": 2.3125, + "grad_norm_var": 0.021043904622395835, + "learning_rate": 0.0001, + "loss": 7.2078, + "loss/crossentropy": 2.1874340772628784, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.20884133875370026, + "step": 4282 + }, + { + "epoch": 0.26775, + "grad_norm": 2.4375, + "grad_norm_var": 0.0204986572265625, + "learning_rate": 0.0001, + "loss": 7.5178, + "loss/crossentropy": 2.2358198165893555, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24393440037965775, + "step": 4284 + }, + { + "epoch": 0.267875, + "grad_norm": 2.28125, + "grad_norm_var": 0.021776326497395835, + "learning_rate": 0.0001, + "loss": 7.2812, + "loss/crossentropy": 2.064828336238861, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2007676362991333, + "step": 4286 + }, + { + "epoch": 0.268, + "grad_norm": 2.296875, + "grad_norm_var": 0.022786458333333332, + "learning_rate": 0.0001, + "loss": 7.4434, + "loss/crossentropy": 2.301008701324463, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23284630477428436, + "step": 4288 + }, + { + "epoch": 0.268125, + "grad_norm": 2.296875, + "grad_norm_var": 0.018538411458333334, + "learning_rate": 0.0001, + "loss": 7.3878, + "loss/crossentropy": 1.959551453590393, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.1952490210533142, + "step": 4290 + }, + { + "epoch": 0.26825, + "grad_norm": 2.375, + "grad_norm_var": 0.016890462239583334, + "learning_rate": 0.0001, + "loss": 7.5644, + "loss/crossentropy": 2.352171301841736, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2467745542526245, + "step": 4292 + }, + { + "epoch": 0.268375, + "grad_norm": 2.375, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 7.343, + "loss/crossentropy": 2.213460922241211, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2195800244808197, + "step": 4294 + }, + { + "epoch": 0.2685, + "grad_norm": 2.375, + "grad_norm_var": 0.0146484375, + "learning_rate": 0.0001, + "loss": 7.4869, + "loss/crossentropy": 2.042128264904022, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2317609265446663, + "step": 4296 + }, + { + "epoch": 0.268625, + "grad_norm": 2.25, + "grad_norm_var": 0.014769490559895833, + "learning_rate": 0.0001, + "loss": 7.3506, + "loss/crossentropy": 1.9961625337600708, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.19491659104824066, + "step": 4298 + }, + { + "epoch": 0.26875, + "grad_norm": 2.46875, + "grad_norm_var": 0.017508951822916667, + "learning_rate": 0.0001, + "loss": 7.468, + "loss/crossentropy": 2.118159055709839, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2275976911187172, + "step": 4300 + }, + { + "epoch": 0.268875, + "grad_norm": 2.421875, + "grad_norm_var": 0.015095011393229166, + "learning_rate": 0.0001, + "loss": 7.3515, + "loss/crossentropy": 2.206713318824768, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21720553189516068, + "step": 4302 + }, + { + "epoch": 0.269, + "grad_norm": 2.28125, + "grad_norm_var": 0.01480712890625, + "learning_rate": 0.0001, + "loss": 7.4099, + "loss/crossentropy": 2.2401949167251587, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21897585690021515, + "step": 4304 + }, + { + "epoch": 0.269125, + "grad_norm": 2.21875, + "grad_norm_var": 0.0141021728515625, + "learning_rate": 0.0001, + "loss": 7.3092, + "loss/crossentropy": 2.1088130474090576, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21423090249300003, + "step": 4306 + }, + { + "epoch": 0.26925, + "grad_norm": 2.546875, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 7.4478, + "loss/crossentropy": 2.2879436016082764, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20457830280065536, + "step": 4308 + }, + { + "epoch": 0.269375, + "grad_norm": 2.46875, + "grad_norm_var": 0.01558837890625, + "learning_rate": 0.0001, + "loss": 7.4249, + "loss/crossentropy": 2.038703441619873, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21783077716827393, + "step": 4310 + }, + { + "epoch": 0.2695, + "grad_norm": 2.375, + "grad_norm_var": 0.01842041015625, + "learning_rate": 0.0001, + "loss": 7.6062, + "loss/crossentropy": 2.4648650884628296, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22937683761119843, + "step": 4312 + }, + { + "epoch": 0.269625, + "grad_norm": 2.25, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 7.4844, + "loss/crossentropy": 2.0551947951316833, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22646934539079666, + "step": 4314 + }, + { + "epoch": 0.26975, + "grad_norm": 2.375, + "grad_norm_var": 0.015165201822916667, + "learning_rate": 0.0001, + "loss": 7.4434, + "loss/crossentropy": 2.237433969974518, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22162948548793793, + "step": 4316 + }, + { + "epoch": 0.269875, + "grad_norm": 2.515625, + "grad_norm_var": 0.015543619791666666, + "learning_rate": 0.0001, + "loss": 7.5201, + "loss/crossentropy": 2.275120735168457, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23471694439649582, + "step": 4318 + }, + { + "epoch": 0.27, + "grad_norm": 2.078125, + "grad_norm_var": 0.019733683268229166, + "learning_rate": 0.0001, + "loss": 7.3651, + "loss/crossentropy": 2.355165123939514, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.25206050276756287, + "step": 4320 + }, + { + "epoch": 0.270125, + "grad_norm": 2.4375, + "grad_norm_var": 0.018602498372395835, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.378506660461426, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22956155985593796, + "step": 4322 + }, + { + "epoch": 0.27025, + "grad_norm": 2.515625, + "grad_norm_var": 0.01636962890625, + "learning_rate": 0.0001, + "loss": 7.6102, + "loss/crossentropy": 2.2196805477142334, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23681814968585968, + "step": 4324 + }, + { + "epoch": 0.270375, + "grad_norm": 2.5625, + "grad_norm_var": 0.0182525634765625, + "learning_rate": 0.0001, + "loss": 7.3513, + "loss/crossentropy": 2.1617177724838257, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.19959037005901337, + "step": 4326 + }, + { + "epoch": 0.2705, + "grad_norm": 2.421875, + "grad_norm_var": 0.022386678059895835, + "learning_rate": 0.0001, + "loss": 7.4452, + "loss/crossentropy": 2.2576276063919067, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2321421429514885, + "step": 4328 + }, + { + "epoch": 0.270625, + "grad_norm": 2.140625, + "grad_norm_var": 0.025267537434895834, + "learning_rate": 0.0001, + "loss": 7.36, + "loss/crossentropy": 2.213522434234619, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2282974198460579, + "step": 4330 + }, + { + "epoch": 0.27075, + "grad_norm": 2.21875, + "grad_norm_var": 0.028880818684895834, + "learning_rate": 0.0001, + "loss": 7.3864, + "loss/crossentropy": 1.9888715744018555, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21721196174621582, + "step": 4332 + }, + { + "epoch": 0.270875, + "grad_norm": 2.515625, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 7.3121, + "loss/crossentropy": 2.3635976314544678, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22595931589603424, + "step": 4334 + }, + { + "epoch": 0.271, + "grad_norm": 2.390625, + "grad_norm_var": 0.022997029622395835, + "learning_rate": 0.0001, + "loss": 7.5235, + "loss/crossentropy": 2.4795159101486206, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23015367984771729, + "step": 4336 + }, + { + "epoch": 0.271125, + "grad_norm": 2.984375, + "grad_norm_var": 0.047998046875, + "learning_rate": 0.0001, + "loss": 7.4249, + "loss/crossentropy": 2.234145760536194, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20242837071418762, + "step": 4338 + }, + { + "epoch": 0.27125, + "grad_norm": 2.46875, + "grad_norm_var": 0.04712626139322917, + "learning_rate": 0.0001, + "loss": 7.5575, + "loss/crossentropy": 2.2627410888671875, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2454666718840599, + "step": 4340 + }, + { + "epoch": 0.271375, + "grad_norm": 2.34375, + "grad_norm_var": 0.04541727701822917, + "learning_rate": 0.0001, + "loss": 7.3373, + "loss/crossentropy": 2.4094841480255127, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22295518964529037, + "step": 4342 + }, + { + "epoch": 0.2715, + "grad_norm": 2.328125, + "grad_norm_var": 0.03918863932291667, + "learning_rate": 0.0001, + "loss": 7.489, + "loss/crossentropy": 2.485226035118103, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2179865539073944, + "step": 4344 + }, + { + "epoch": 0.271625, + "grad_norm": 2.3125, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 7.4442, + "loss/crossentropy": 2.3933740854263306, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.24797789752483368, + "step": 4346 + }, + { + "epoch": 0.27175, + "grad_norm": 2.359375, + "grad_norm_var": 0.028246053059895835, + "learning_rate": 0.0001, + "loss": 7.4128, + "loss/crossentropy": 2.136129140853882, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.20815995335578918, + "step": 4348 + }, + { + "epoch": 0.271875, + "grad_norm": 2.234375, + "grad_norm_var": 0.029474894205729168, + "learning_rate": 0.0001, + "loss": 7.3621, + "loss/crossentropy": 2.2934054136276245, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22534728795289993, + "step": 4350 + }, + { + "epoch": 0.272, + "grad_norm": 2.40625, + "grad_norm_var": 0.029808553059895833, + "learning_rate": 0.0001, + "loss": 7.7497, + "loss/crossentropy": 2.2244023084640503, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24372966587543488, + "step": 4352 + }, + { + "epoch": 0.272125, + "grad_norm": 2.375, + "grad_norm_var": 0.004515584309895833, + "learning_rate": 0.0001, + "loss": 7.5103, + "loss/crossentropy": 2.2522358894348145, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2237512692809105, + "step": 4354 + }, + { + "epoch": 0.27225, + "grad_norm": 2.53125, + "grad_norm_var": 0.005533854166666667, + "learning_rate": 0.0001, + "loss": 7.5503, + "loss/crossentropy": 2.006240487098694, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21218693256378174, + "step": 4356 + }, + { + "epoch": 0.272375, + "grad_norm": 2.34375, + "grad_norm_var": 0.007356770833333333, + "learning_rate": 0.0001, + "loss": 7.4, + "loss/crossentropy": 2.1753203868865967, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21662192791700363, + "step": 4358 + }, + { + "epoch": 0.2725, + "grad_norm": 2.1875, + "grad_norm_var": 0.008317057291666667, + "learning_rate": 0.0001, + "loss": 7.4924, + "loss/crossentropy": 2.5050474405288696, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23573968559503555, + "step": 4360 + }, + { + "epoch": 0.272625, + "grad_norm": 2.1875, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 7.2394, + "loss/crossentropy": 2.2561826705932617, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2149110957980156, + "step": 4362 + }, + { + "epoch": 0.27275, + "grad_norm": 2.34375, + "grad_norm_var": 0.011262003580729167, + "learning_rate": 0.0001, + "loss": 7.5461, + "loss/crossentropy": 2.4560667276382446, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22948572784662247, + "step": 4364 + }, + { + "epoch": 0.272875, + "grad_norm": 2.328125, + "grad_norm_var": 0.010895792643229167, + "learning_rate": 0.0001, + "loss": 7.5201, + "loss/crossentropy": 2.2445766925811768, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.24865839630365372, + "step": 4366 + }, + { + "epoch": 0.273, + "grad_norm": 2.484375, + "grad_norm_var": 0.010602823893229167, + "learning_rate": 0.0001, + "loss": 7.3433, + "loss/crossentropy": 2.048405647277832, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21950747072696686, + "step": 4368 + }, + { + "epoch": 0.273125, + "grad_norm": 2.1875, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 7.478, + "loss/crossentropy": 2.5548731088638306, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.24532774090766907, + "step": 4370 + }, + { + "epoch": 0.27325, + "grad_norm": 2.328125, + "grad_norm_var": 0.0150390625, + "learning_rate": 0.0001, + "loss": 7.4301, + "loss/crossentropy": 2.158105969429016, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2423110455274582, + "step": 4372 + }, + { + "epoch": 0.273375, + "grad_norm": 2.375, + "grad_norm_var": 0.014518229166666667, + "learning_rate": 0.0001, + "loss": 7.5337, + "loss/crossentropy": 2.3834049701690674, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2357938289642334, + "step": 4374 + }, + { + "epoch": 0.2735, + "grad_norm": 2.578125, + "grad_norm_var": 0.017692057291666667, + "learning_rate": 0.0001, + "loss": 7.3858, + "loss/crossentropy": 2.2216193675994873, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23488235473632812, + "step": 4376 + }, + { + "epoch": 0.273625, + "grad_norm": 2.3125, + "grad_norm_var": 0.04396870930989583, + "learning_rate": 0.0001, + "loss": 7.4476, + "loss/crossentropy": 2.11636745929718, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21510595828294754, + "step": 4378 + }, + { + "epoch": 0.27375, + "grad_norm": 2.203125, + "grad_norm_var": 0.04899800618489583, + "learning_rate": 0.0001, + "loss": 7.572, + "loss/crossentropy": 2.2707090377807617, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.22173020988702774, + "step": 4380 + }, + { + "epoch": 0.273875, + "grad_norm": 2.203125, + "grad_norm_var": 0.05029296875, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 2.270142912864685, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21372423321008682, + "step": 4382 + }, + { + "epoch": 0.274, + "grad_norm": 2.484375, + "grad_norm_var": 0.053120930989583336, + "learning_rate": 0.0001, + "loss": 7.4542, + "loss/crossentropy": 2.403599977493286, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20995523035526276, + "step": 4384 + }, + { + "epoch": 0.274125, + "grad_norm": 2.484375, + "grad_norm_var": 0.049071248372395834, + "learning_rate": 0.0001, + "loss": 7.6809, + "loss/crossentropy": 2.2590737342834473, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22851084172725677, + "step": 4386 + }, + { + "epoch": 0.27425, + "grad_norm": 2.5, + "grad_norm_var": 0.046647135416666666, + "learning_rate": 0.0001, + "loss": 7.5104, + "loss/crossentropy": 2.303091526031494, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.24387390911579132, + "step": 4388 + }, + { + "epoch": 0.274375, + "grad_norm": 2.59375, + "grad_norm_var": 0.05137430826822917, + "learning_rate": 0.0001, + "loss": 7.5524, + "loss/crossentropy": 2.3587981462478638, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2259790152311325, + "step": 4390 + }, + { + "epoch": 0.2745, + "grad_norm": 2.1875, + "grad_norm_var": 0.05120442708333333, + "learning_rate": 0.0001, + "loss": 7.5764, + "loss/crossentropy": 2.26511013507843, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2363736778497696, + "step": 4392 + }, + { + "epoch": 0.274625, + "grad_norm": 2.28125, + "grad_norm_var": 0.028758748372395834, + "learning_rate": 0.0001, + "loss": 7.2611, + "loss/crossentropy": 2.1224186420440674, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.21887023746967316, + "step": 4394 + }, + { + "epoch": 0.27475, + "grad_norm": 2.34375, + "grad_norm_var": 0.024665323893229167, + "learning_rate": 0.0001, + "loss": 7.411, + "loss/crossentropy": 2.4636008739471436, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22841719537973404, + "step": 4396 + }, + { + "epoch": 0.274875, + "grad_norm": 2.390625, + "grad_norm_var": 0.022785441080729166, + "learning_rate": 0.0001, + "loss": 7.4425, + "loss/crossentropy": 2.098508358001709, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.24196244776248932, + "step": 4398 + }, + { + "epoch": 0.275, + "grad_norm": 2.296875, + "grad_norm_var": 0.020856730143229165, + "learning_rate": 0.0001, + "loss": 7.6109, + "loss/crossentropy": 2.245239734649658, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2130843922495842, + "step": 4400 + }, + { + "epoch": 0.275125, + "grad_norm": 2.234375, + "grad_norm_var": 0.025374348958333334, + "learning_rate": 0.0001, + "loss": 7.3746, + "loss/crossentropy": 2.1706331968307495, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21158546209335327, + "step": 4402 + }, + { + "epoch": 0.27525, + "grad_norm": 2.296875, + "grad_norm_var": 0.026200358072916666, + "learning_rate": 0.0001, + "loss": 7.2704, + "loss/crossentropy": 2.114712119102478, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21735002845525742, + "step": 4404 + }, + { + "epoch": 0.275375, + "grad_norm": 2.375, + "grad_norm_var": 0.01597900390625, + "learning_rate": 0.0001, + "loss": 7.4371, + "loss/crossentropy": 2.4146808385849, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2161477878689766, + "step": 4406 + }, + { + "epoch": 0.2755, + "grad_norm": 2.28125, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 7.5387, + "loss/crossentropy": 2.5566210746765137, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23766440898180008, + "step": 4408 + }, + { + "epoch": 0.275625, + "grad_norm": 2.125, + "grad_norm_var": 0.01412353515625, + "learning_rate": 0.0001, + "loss": 7.3358, + "loss/crossentropy": 2.216295003890991, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21209152787923813, + "step": 4410 + }, + { + "epoch": 0.27575, + "grad_norm": 2.328125, + "grad_norm_var": 0.01796875, + "learning_rate": 0.0001, + "loss": 7.5077, + "loss/crossentropy": 2.306955099105835, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2539971098303795, + "step": 4412 + }, + { + "epoch": 0.275875, + "grad_norm": 2.328125, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 7.4567, + "loss/crossentropy": 2.462777853012085, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21741768717765808, + "step": 4414 + }, + { + "epoch": 0.276, + "grad_norm": 2.734375, + "grad_norm_var": 0.0329010009765625, + "learning_rate": 0.0001, + "loss": 7.5509, + "loss/crossentropy": 2.155795156955719, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22352226078510284, + "step": 4416 + }, + { + "epoch": 0.276125, + "grad_norm": 3.234375, + "grad_norm_var": 0.088720703125, + "learning_rate": 0.0001, + "loss": 7.4472, + "loss/crossentropy": 2.4492448568344116, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22944186627864838, + "step": 4418 + }, + { + "epoch": 0.27625, + "grad_norm": 3.453125, + "grad_norm_var": 0.14516499837239583, + "learning_rate": 0.0001, + "loss": 7.4556, + "loss/crossentropy": 2.296768367290497, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2189040184020996, + "step": 4420 + }, + { + "epoch": 0.276375, + "grad_norm": 2.1875, + "grad_norm_var": 0.1495758056640625, + "learning_rate": 0.0001, + "loss": 7.4439, + "loss/crossentropy": 2.4126769304275513, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23999958485364914, + "step": 4422 + }, + { + "epoch": 0.2765, + "grad_norm": 2.40625, + "grad_norm_var": 0.14530843098958332, + "learning_rate": 0.0001, + "loss": 7.501, + "loss/crossentropy": 2.47876238822937, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.23916874080896378, + "step": 4424 + }, + { + "epoch": 0.276625, + "grad_norm": 2.40625, + "grad_norm_var": 0.138720703125, + "learning_rate": 0.0001, + "loss": 7.5551, + "loss/crossentropy": 2.282703399658203, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2197970598936081, + "step": 4426 + }, + { + "epoch": 0.27675, + "grad_norm": 2.171875, + "grad_norm_var": 0.1488677978515625, + "learning_rate": 0.0001, + "loss": 7.4234, + "loss/crossentropy": 2.26338267326355, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22207339107990265, + "step": 4428 + }, + { + "epoch": 0.276875, + "grad_norm": 4.59375, + "grad_norm_var": 0.3890462239583333, + "learning_rate": 0.0001, + "loss": 7.5694, + "loss/crossentropy": 2.34802508354187, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23877909779548645, + "step": 4430 + }, + { + "epoch": 0.277, + "grad_norm": 2.28125, + "grad_norm_var": 0.4018951416015625, + "learning_rate": 0.0001, + "loss": 7.6152, + "loss/crossentropy": 2.4531562328338623, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.24313053488731384, + "step": 4432 + }, + { + "epoch": 0.277125, + "grad_norm": 2.265625, + "grad_norm_var": 0.39807535807291666, + "learning_rate": 0.0001, + "loss": 7.2933, + "loss/crossentropy": 2.1458455324172974, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21921583265066147, + "step": 4434 + }, + { + "epoch": 0.27725, + "grad_norm": 2.453125, + "grad_norm_var": 0.34549153645833336, + "learning_rate": 0.0001, + "loss": 7.3935, + "loss/crossentropy": 2.3641769886016846, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2235148400068283, + "step": 4436 + }, + { + "epoch": 0.277375, + "grad_norm": 2.21875, + "grad_norm_var": 0.3485026041666667, + "learning_rate": 0.0001, + "loss": 7.4657, + "loss/crossentropy": 2.2699997425079346, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2232022061944008, + "step": 4438 + }, + { + "epoch": 0.2775, + "grad_norm": 2.453125, + "grad_norm_var": 0.3463043212890625, + "learning_rate": 0.0001, + "loss": 7.489, + "loss/crossentropy": 2.2820075154304504, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22474562376737595, + "step": 4440 + }, + { + "epoch": 0.277625, + "grad_norm": 2.28125, + "grad_norm_var": 0.34402669270833336, + "learning_rate": 0.0001, + "loss": 7.2763, + "loss/crossentropy": 2.3725186586380005, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2197442501783371, + "step": 4442 + }, + { + "epoch": 0.27775, + "grad_norm": 2.40625, + "grad_norm_var": 0.33488667805989586, + "learning_rate": 0.0001, + "loss": 7.3046, + "loss/crossentropy": 2.0243775248527527, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20709318667650223, + "step": 4444 + }, + { + "epoch": 0.277875, + "grad_norm": 2.5, + "grad_norm_var": 0.019928995768229166, + "learning_rate": 0.0001, + "loss": 7.5691, + "loss/crossentropy": 2.267683506011963, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.23416732996702194, + "step": 4446 + }, + { + "epoch": 0.278, + "grad_norm": 2.296875, + "grad_norm_var": 0.015241495768229167, + "learning_rate": 0.0001, + "loss": 7.5059, + "loss/crossentropy": 2.4565316438674927, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23150788247585297, + "step": 4448 + }, + { + "epoch": 0.278125, + "grad_norm": 2.359375, + "grad_norm_var": 0.015132649739583334, + "learning_rate": 0.0001, + "loss": 7.3083, + "loss/crossentropy": 2.2579123973846436, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23325718939304352, + "step": 4450 + }, + { + "epoch": 0.27825, + "grad_norm": 2.3125, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 7.4666, + "loss/crossentropy": 2.2895156145095825, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23578675836324692, + "step": 4452 + }, + { + "epoch": 0.278375, + "grad_norm": 2.40625, + "grad_norm_var": 0.014232381184895834, + "learning_rate": 0.0001, + "loss": 7.3926, + "loss/crossentropy": 2.4191900491714478, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23101849853992462, + "step": 4454 + }, + { + "epoch": 0.2785, + "grad_norm": 2.625, + "grad_norm_var": 0.01796875, + "learning_rate": 0.0001, + "loss": 7.5229, + "loss/crossentropy": 2.1673884987831116, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21298403292894363, + "step": 4456 + }, + { + "epoch": 0.278625, + "grad_norm": 2.28125, + "grad_norm_var": 0.02232666015625, + "learning_rate": 0.0001, + "loss": 7.2791, + "loss/crossentropy": 1.9522064924240112, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1917770504951477, + "step": 4458 + }, + { + "epoch": 0.27875, + "grad_norm": 2.234375, + "grad_norm_var": 0.02037353515625, + "learning_rate": 0.0001, + "loss": 7.3455, + "loss/crossentropy": 2.064726769924164, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2061791568994522, + "step": 4460 + }, + { + "epoch": 0.278875, + "grad_norm": 2.21875, + "grad_norm_var": 0.0209869384765625, + "learning_rate": 0.0001, + "loss": 7.5458, + "loss/crossentropy": 2.57345187664032, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.24028793722391129, + "step": 4462 + }, + { + "epoch": 0.279, + "grad_norm": 2.296875, + "grad_norm_var": 0.021092732747395832, + "learning_rate": 0.0001, + "loss": 7.52, + "loss/crossentropy": 2.1306859254837036, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21655796468257904, + "step": 4464 + }, + { + "epoch": 0.279125, + "grad_norm": 2.203125, + "grad_norm_var": 0.019237263997395834, + "learning_rate": 0.0001, + "loss": 7.3362, + "loss/crossentropy": 2.0370622873306274, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20722102373838425, + "step": 4466 + }, + { + "epoch": 0.27925, + "grad_norm": 2.234375, + "grad_norm_var": 0.023942057291666666, + "learning_rate": 0.0001, + "loss": 7.3497, + "loss/crossentropy": 2.3192614316940308, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2113727256655693, + "step": 4468 + }, + { + "epoch": 0.279375, + "grad_norm": 2.390625, + "grad_norm_var": 0.0229400634765625, + "learning_rate": 0.0001, + "loss": 7.3515, + "loss/crossentropy": 2.378154754638672, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23498709499835968, + "step": 4470 + }, + { + "epoch": 0.2795, + "grad_norm": 2.234375, + "grad_norm_var": 0.017284138997395834, + "learning_rate": 0.0001, + "loss": 7.3089, + "loss/crossentropy": 2.3729015588760376, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.24130545556545258, + "step": 4472 + }, + { + "epoch": 0.279625, + "grad_norm": 2.203125, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 7.37, + "loss/crossentropy": 2.139391541481018, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.24066069722175598, + "step": 4474 + }, + { + "epoch": 0.27975, + "grad_norm": 2.234375, + "grad_norm_var": 0.0101226806640625, + "learning_rate": 0.0001, + "loss": 7.3748, + "loss/crossentropy": 2.561732530593872, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2334682047367096, + "step": 4476 + }, + { + "epoch": 0.279875, + "grad_norm": 2.375, + "grad_norm_var": 0.0133453369140625, + "learning_rate": 0.0001, + "loss": 7.4443, + "loss/crossentropy": 2.5658878087997437, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22343529760837555, + "step": 4478 + }, + { + "epoch": 0.28, + "grad_norm": 2.234375, + "grad_norm_var": 0.014322916666666666, + "learning_rate": 0.0001, + "loss": 7.2706, + "loss/crossentropy": 2.389992117881775, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2276037633419037, + "step": 4480 + }, + { + "epoch": 0.280125, + "grad_norm": 2.28125, + "grad_norm_var": 0.022044881184895834, + "learning_rate": 0.0001, + "loss": 7.5801, + "loss/crossentropy": 2.496686816215515, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2241569608449936, + "step": 4482 + }, + { + "epoch": 0.28025, + "grad_norm": 2.265625, + "grad_norm_var": 0.017658487955729166, + "learning_rate": 0.0001, + "loss": 7.3432, + "loss/crossentropy": 2.33056378364563, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.23244759440422058, + "step": 4484 + }, + { + "epoch": 0.280375, + "grad_norm": 2.296875, + "grad_norm_var": 0.017313639322916668, + "learning_rate": 0.0001, + "loss": 7.4858, + "loss/crossentropy": 2.2192925214767456, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21771762520074844, + "step": 4486 + }, + { + "epoch": 0.2805, + "grad_norm": 2.296875, + "grad_norm_var": 0.017902628580729166, + "learning_rate": 0.0001, + "loss": 7.3754, + "loss/crossentropy": 2.1574283838272095, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20880410075187683, + "step": 4488 + }, + { + "epoch": 0.280625, + "grad_norm": 2.1875, + "grad_norm_var": 0.019266764322916668, + "learning_rate": 0.0001, + "loss": 7.4153, + "loss/crossentropy": 2.3507992029190063, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23332297801971436, + "step": 4490 + }, + { + "epoch": 0.28075, + "grad_norm": 2.5, + "grad_norm_var": 0.021833292643229165, + "learning_rate": 0.0001, + "loss": 7.4313, + "loss/crossentropy": 2.2984365224838257, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.22823016345500946, + "step": 4492 + }, + { + "epoch": 0.280875, + "grad_norm": 2.125, + "grad_norm_var": 0.021076456705729166, + "learning_rate": 0.0001, + "loss": 7.2686, + "loss/crossentropy": 2.104506731033325, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22106795012950897, + "step": 4494 + }, + { + "epoch": 0.281, + "grad_norm": 2.109375, + "grad_norm_var": 0.022196451822916668, + "learning_rate": 0.0001, + "loss": 7.537, + "loss/crossentropy": 2.197329521179199, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.222248375415802, + "step": 4496 + }, + { + "epoch": 0.281125, + "grad_norm": 2.28125, + "grad_norm_var": 0.012995402018229166, + "learning_rate": 0.0001, + "loss": 7.3045, + "loss/crossentropy": 2.2515393495559692, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22155503183603287, + "step": 4498 + }, + { + "epoch": 0.28125, + "grad_norm": 2.53125, + "grad_norm_var": 0.019986979166666665, + "learning_rate": 0.0001, + "loss": 7.3329, + "loss/crossentropy": 2.1618112325668335, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22284910082817078, + "step": 4500 + }, + { + "epoch": 0.281375, + "grad_norm": 2.109375, + "grad_norm_var": 0.0228424072265625, + "learning_rate": 0.0001, + "loss": 7.2701, + "loss/crossentropy": 2.088331937789917, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23195888847112656, + "step": 4502 + }, + { + "epoch": 0.2815, + "grad_norm": 2.578125, + "grad_norm_var": 0.026529947916666668, + "learning_rate": 0.0001, + "loss": 7.4928, + "loss/crossentropy": 2.327447533607483, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22215363383293152, + "step": 4504 + }, + { + "epoch": 0.281625, + "grad_norm": 2.21875, + "grad_norm_var": 0.026334635416666665, + "learning_rate": 0.0001, + "loss": 7.4629, + "loss/crossentropy": 2.294684410095215, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2363322228193283, + "step": 4506 + }, + { + "epoch": 0.28175, + "grad_norm": 2.25, + "grad_norm_var": 0.02236328125, + "learning_rate": 0.0001, + "loss": 7.4728, + "loss/crossentropy": 2.3789936304092407, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22490855306386948, + "step": 4508 + }, + { + "epoch": 0.281875, + "grad_norm": 2.296875, + "grad_norm_var": 0.019554646809895833, + "learning_rate": 0.0001, + "loss": 7.45, + "loss/crossentropy": 2.167789340019226, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21532851457595825, + "step": 4510 + }, + { + "epoch": 0.282, + "grad_norm": 2.265625, + "grad_norm_var": 0.016795857747395834, + "learning_rate": 0.0001, + "loss": 7.3859, + "loss/crossentropy": 2.197750449180603, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22491279989480972, + "step": 4512 + }, + { + "epoch": 0.282125, + "grad_norm": 2.171875, + "grad_norm_var": 0.017951456705729167, + "learning_rate": 0.0001, + "loss": 7.3363, + "loss/crossentropy": 2.168722629547119, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2121475487947464, + "step": 4514 + }, + { + "epoch": 0.28225, + "grad_norm": 2.4375, + "grad_norm_var": 0.013407389322916666, + "learning_rate": 0.0001, + "loss": 7.3139, + "loss/crossentropy": 2.154773235321045, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22038134932518005, + "step": 4516 + }, + { + "epoch": 0.282375, + "grad_norm": 2.390625, + "grad_norm_var": 0.01109619140625, + "learning_rate": 0.0001, + "loss": 7.4864, + "loss/crossentropy": 2.1986958980560303, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.22602716833353043, + "step": 4518 + }, + { + "epoch": 0.2825, + "grad_norm": 2.4375, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 7.2571, + "loss/crossentropy": 2.1418489813804626, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20537669956684113, + "step": 4520 + }, + { + "epoch": 0.282625, + "grad_norm": 2.296875, + "grad_norm_var": 0.010081990559895834, + "learning_rate": 0.0001, + "loss": 7.478, + "loss/crossentropy": 2.152292013168335, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21266800165176392, + "step": 4522 + }, + { + "epoch": 0.28275, + "grad_norm": 2.265625, + "grad_norm_var": 0.012181599934895834, + "learning_rate": 0.0001, + "loss": 7.3898, + "loss/crossentropy": 2.3404839038848877, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21193323284387589, + "step": 4524 + }, + { + "epoch": 0.282875, + "grad_norm": 2.59375, + "grad_norm_var": 0.0173004150390625, + "learning_rate": 0.0001, + "loss": 7.5024, + "loss/crossentropy": 2.012498140335083, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21007203310728073, + "step": 4526 + }, + { + "epoch": 0.283, + "grad_norm": 2.328125, + "grad_norm_var": 0.016975911458333333, + "learning_rate": 0.0001, + "loss": 7.3939, + "loss/crossentropy": 2.218207001686096, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23073484003543854, + "step": 4528 + }, + { + "epoch": 0.283125, + "grad_norm": 2.21875, + "grad_norm_var": 0.016185506184895834, + "learning_rate": 0.0001, + "loss": 7.519, + "loss/crossentropy": 2.3596348762512207, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2409394308924675, + "step": 4530 + }, + { + "epoch": 0.28325, + "grad_norm": 2.546875, + "grad_norm_var": 0.016502888997395833, + "learning_rate": 0.0001, + "loss": 7.4134, + "loss/crossentropy": 2.223081946372986, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22396845370531082, + "step": 4532 + }, + { + "epoch": 0.283375, + "grad_norm": 2.3125, + "grad_norm_var": 0.019755045572916668, + "learning_rate": 0.0001, + "loss": 7.3217, + "loss/crossentropy": 2.258087635040283, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21903490275144577, + "step": 4534 + }, + { + "epoch": 0.2835, + "grad_norm": 2.421875, + "grad_norm_var": 0.022379557291666668, + "learning_rate": 0.0001, + "loss": 7.3878, + "loss/crossentropy": 2.3071266412734985, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.21915674209594727, + "step": 4536 + }, + { + "epoch": 0.283625, + "grad_norm": 2.296875, + "grad_norm_var": 0.020035807291666666, + "learning_rate": 0.0001, + "loss": 7.4188, + "loss/crossentropy": 2.1818684339523315, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.22117509692907333, + "step": 4538 + }, + { + "epoch": 0.28375, + "grad_norm": 2.453125, + "grad_norm_var": 0.015851847330729165, + "learning_rate": 0.0001, + "loss": 7.4492, + "loss/crossentropy": 2.411054253578186, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22131475806236267, + "step": 4540 + }, + { + "epoch": 0.283875, + "grad_norm": 2.3125, + "grad_norm_var": 0.0133209228515625, + "learning_rate": 0.0001, + "loss": 7.4197, + "loss/crossentropy": 2.2065125703811646, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21521207690238953, + "step": 4542 + }, + { + "epoch": 0.284, + "grad_norm": 2.453125, + "grad_norm_var": 0.0134918212890625, + "learning_rate": 0.0001, + "loss": 7.4471, + "loss/crossentropy": 2.266932725906372, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22295700013637543, + "step": 4544 + }, + { + "epoch": 0.284125, + "grad_norm": 2.328125, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 7.5007, + "loss/crossentropy": 2.360601305961609, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23160873353481293, + "step": 4546 + }, + { + "epoch": 0.28425, + "grad_norm": 2.21875, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 7.3673, + "loss/crossentropy": 2.1906508207321167, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21694105863571167, + "step": 4548 + }, + { + "epoch": 0.284375, + "grad_norm": 2.328125, + "grad_norm_var": 0.011649576822916667, + "learning_rate": 0.0001, + "loss": 7.4986, + "loss/crossentropy": 2.348879337310791, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.213335782289505, + "step": 4550 + }, + { + "epoch": 0.2845, + "grad_norm": 2.390625, + "grad_norm_var": 0.008128865559895834, + "learning_rate": 0.0001, + "loss": 7.5221, + "loss/crossentropy": 2.343509554862976, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.24417825788259506, + "step": 4552 + }, + { + "epoch": 0.284625, + "grad_norm": 2.1875, + "grad_norm_var": 0.008524576822916666, + "learning_rate": 0.0001, + "loss": 7.4006, + "loss/crossentropy": 2.243224263191223, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21264301240444183, + "step": 4554 + }, + { + "epoch": 0.28475, + "grad_norm": 2.5, + "grad_norm_var": 0.011823527018229167, + "learning_rate": 0.0001, + "loss": 7.4715, + "loss/crossentropy": 2.383206009864807, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2533396929502487, + "step": 4556 + }, + { + "epoch": 0.284875, + "grad_norm": 2.140625, + "grad_norm_var": 0.013695271809895833, + "learning_rate": 0.0001, + "loss": 7.2299, + "loss/crossentropy": 2.2882405519485474, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23156971484422684, + "step": 4558 + }, + { + "epoch": 0.285, + "grad_norm": 2.25, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 7.4714, + "loss/crossentropy": 2.3273061513900757, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2340613454580307, + "step": 4560 + }, + { + "epoch": 0.285125, + "grad_norm": 2.1875, + "grad_norm_var": 0.01236572265625, + "learning_rate": 0.0001, + "loss": 7.2556, + "loss/crossentropy": 2.206205368041992, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.21796930581331253, + "step": 4562 + }, + { + "epoch": 0.28525, + "grad_norm": 2.328125, + "grad_norm_var": 0.0104644775390625, + "learning_rate": 0.0001, + "loss": 7.3176, + "loss/crossentropy": 2.26140820980072, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21520189940929413, + "step": 4564 + }, + { + "epoch": 0.285375, + "grad_norm": 2.28125, + "grad_norm_var": 0.01904296875, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 2.1846988201141357, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23294325172901154, + "step": 4566 + }, + { + "epoch": 0.2855, + "grad_norm": 2.484375, + "grad_norm_var": 0.020262654622395834, + "learning_rate": 0.0001, + "loss": 7.3939, + "loss/crossentropy": 2.1126757860183716, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20701631158590317, + "step": 4568 + }, + { + "epoch": 0.285625, + "grad_norm": 2.828125, + "grad_norm_var": 0.03251851399739583, + "learning_rate": 0.0001, + "loss": 7.4807, + "loss/crossentropy": 2.407546877861023, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22992483526468277, + "step": 4570 + }, + { + "epoch": 0.28575, + "grad_norm": 2.296875, + "grad_norm_var": 0.0304840087890625, + "learning_rate": 0.0001, + "loss": 7.2665, + "loss/crossentropy": 2.281604766845703, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2113427072763443, + "step": 4572 + }, + { + "epoch": 0.285875, + "grad_norm": 2.3125, + "grad_norm_var": 0.0279205322265625, + "learning_rate": 0.0001, + "loss": 7.5764, + "loss/crossentropy": 2.199875235557556, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21796562522649765, + "step": 4574 + }, + { + "epoch": 0.286, + "grad_norm": 2.296875, + "grad_norm_var": 0.027318318684895832, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 2.4474005699157715, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22615059465169907, + "step": 4576 + }, + { + "epoch": 0.286125, + "grad_norm": 2.234375, + "grad_norm_var": 0.025809733072916667, + "learning_rate": 0.0001, + "loss": 7.4363, + "loss/crossentropy": 2.554097890853882, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2136968970298767, + "step": 4578 + }, + { + "epoch": 0.28625, + "grad_norm": 2.375, + "grad_norm_var": 0.028218587239583332, + "learning_rate": 0.0001, + "loss": 7.3431, + "loss/crossentropy": 2.4345656633377075, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.22010788321495056, + "step": 4580 + }, + { + "epoch": 0.286375, + "grad_norm": 2.359375, + "grad_norm_var": 0.021240234375, + "learning_rate": 0.0001, + "loss": 7.6311, + "loss/crossentropy": 2.0311567187309265, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21834005415439606, + "step": 4582 + }, + { + "epoch": 0.2865, + "grad_norm": 2.5625, + "grad_norm_var": 0.024019368489583335, + "learning_rate": 0.0001, + "loss": 7.5513, + "loss/crossentropy": 2.2485233545303345, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23373603075742722, + "step": 4584 + }, + { + "epoch": 0.286625, + "grad_norm": 2.40625, + "grad_norm_var": 0.009859212239583333, + "learning_rate": 0.0001, + "loss": 7.312, + "loss/crossentropy": 2.2216570377349854, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2155969738960266, + "step": 4586 + }, + { + "epoch": 0.28675, + "grad_norm": 2.421875, + "grad_norm_var": 0.009943644205729166, + "learning_rate": 0.0001, + "loss": 7.5092, + "loss/crossentropy": 2.213111639022827, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2212653011083603, + "step": 4588 + }, + { + "epoch": 0.286875, + "grad_norm": 2.109375, + "grad_norm_var": 0.013362630208333334, + "learning_rate": 0.0001, + "loss": 7.3797, + "loss/crossentropy": 2.257219433784485, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22302033007144928, + "step": 4590 + }, + { + "epoch": 0.287, + "grad_norm": 2.609375, + "grad_norm_var": 0.018452962239583332, + "learning_rate": 0.0001, + "loss": 7.3395, + "loss/crossentropy": 2.374780535697937, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2257033884525299, + "step": 4592 + }, + { + "epoch": 0.287125, + "grad_norm": 2.40625, + "grad_norm_var": 0.021761067708333335, + "learning_rate": 0.0001, + "loss": 7.2915, + "loss/crossentropy": 2.033597767353058, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.22485361993312836, + "step": 4594 + }, + { + "epoch": 0.28725, + "grad_norm": 2.171875, + "grad_norm_var": 0.022591145833333333, + "learning_rate": 0.0001, + "loss": 7.4737, + "loss/crossentropy": 2.3573769330978394, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2300783023238182, + "step": 4596 + }, + { + "epoch": 0.287375, + "grad_norm": 2.28125, + "grad_norm_var": 0.0231109619140625, + "learning_rate": 0.0001, + "loss": 7.3957, + "loss/crossentropy": 2.3102123737335205, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22356858849525452, + "step": 4598 + }, + { + "epoch": 0.2875, + "grad_norm": 2.140625, + "grad_norm_var": 0.02281494140625, + "learning_rate": 0.0001, + "loss": 7.2862, + "loss/crossentropy": 2.392950177192688, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21436259150505066, + "step": 4600 + }, + { + "epoch": 0.287625, + "grad_norm": 2.515625, + "grad_norm_var": 0.0263824462890625, + "learning_rate": 0.0001, + "loss": 7.3518, + "loss/crossentropy": 2.2781176567077637, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23186073452234268, + "step": 4602 + }, + { + "epoch": 0.28775, + "grad_norm": 2.234375, + "grad_norm_var": 0.0295074462890625, + "learning_rate": 0.0001, + "loss": 7.3298, + "loss/crossentropy": 2.2567808628082275, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2299715131521225, + "step": 4604 + }, + { + "epoch": 0.287875, + "grad_norm": 2.171875, + "grad_norm_var": 0.0271392822265625, + "learning_rate": 0.0001, + "loss": 7.3105, + "loss/crossentropy": 1.9985857605934143, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21828313171863556, + "step": 4606 + }, + { + "epoch": 0.288, + "grad_norm": 2.3125, + "grad_norm_var": 0.022749837239583334, + "learning_rate": 0.0001, + "loss": 7.4778, + "loss/crossentropy": 2.2161970138549805, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2215772345662117, + "step": 4608 + }, + { + "epoch": 0.288125, + "grad_norm": 2.21875, + "grad_norm_var": 0.01998291015625, + "learning_rate": 0.0001, + "loss": 7.2087, + "loss/crossentropy": 1.9949069619178772, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.1976795345544815, + "step": 4610 + }, + { + "epoch": 0.28825, + "grad_norm": 2.296875, + "grad_norm_var": 0.0176422119140625, + "learning_rate": 0.0001, + "loss": 7.3442, + "loss/crossentropy": 2.1542601585388184, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20314577966928482, + "step": 4612 + }, + { + "epoch": 0.288375, + "grad_norm": 2.359375, + "grad_norm_var": 0.017822265625, + "learning_rate": 0.0001, + "loss": 7.4219, + "loss/crossentropy": 2.1769785284996033, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22160416841506958, + "step": 4614 + }, + { + "epoch": 0.2885, + "grad_norm": 2.265625, + "grad_norm_var": 0.016673787434895834, + "learning_rate": 0.0001, + "loss": 7.4635, + "loss/crossentropy": 2.079149842262268, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21963858604431152, + "step": 4616 + }, + { + "epoch": 0.288625, + "grad_norm": 2.265625, + "grad_norm_var": 0.012743123372395833, + "learning_rate": 0.0001, + "loss": 7.4368, + "loss/crossentropy": 2.1562063694000244, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20194550603628159, + "step": 4618 + }, + { + "epoch": 0.28875, + "grad_norm": 2.234375, + "grad_norm_var": 0.0077626546223958336, + "learning_rate": 0.0001, + "loss": 7.3215, + "loss/crossentropy": 2.285536289215088, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.23264925926923752, + "step": 4620 + }, + { + "epoch": 0.288875, + "grad_norm": 2.265625, + "grad_norm_var": 0.010933430989583333, + "learning_rate": 0.0001, + "loss": 7.5954, + "loss/crossentropy": 2.383559465408325, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21055196970701218, + "step": 4622 + }, + { + "epoch": 0.289, + "grad_norm": 2.15625, + "grad_norm_var": 0.01123046875, + "learning_rate": 0.0001, + "loss": 7.3199, + "loss/crossentropy": 2.09514844417572, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21392296999692917, + "step": 4624 + }, + { + "epoch": 0.289125, + "grad_norm": 2.46875, + "grad_norm_var": 0.012279256184895834, + "learning_rate": 0.0001, + "loss": 7.4886, + "loss/crossentropy": 2.366590738296509, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2233487293124199, + "step": 4626 + }, + { + "epoch": 0.28925, + "grad_norm": 2.375, + "grad_norm_var": 0.0121246337890625, + "learning_rate": 0.0001, + "loss": 7.627, + "loss/crossentropy": 2.5399746894836426, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22708307206630707, + "step": 4628 + }, + { + "epoch": 0.289375, + "grad_norm": 2.390625, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 7.3466, + "loss/crossentropy": 2.181055188179016, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21900714933872223, + "step": 4630 + }, + { + "epoch": 0.2895, + "grad_norm": 2.4375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 7.3034, + "loss/crossentropy": 2.297792911529541, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2245972827076912, + "step": 4632 + }, + { + "epoch": 0.289625, + "grad_norm": 2.3125, + "grad_norm_var": 0.012946573893229167, + "learning_rate": 0.0001, + "loss": 7.3323, + "loss/crossentropy": 2.329147696495056, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.22863604873418808, + "step": 4634 + }, + { + "epoch": 0.28975, + "grad_norm": 2.703125, + "grad_norm_var": 0.23771158854166666, + "learning_rate": 0.0001, + "loss": 7.3751, + "loss/crossentropy": 2.1996554136276245, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21396170556545258, + "step": 4636 + }, + { + "epoch": 0.289875, + "grad_norm": 2.390625, + "grad_norm_var": 0.23672587076822918, + "learning_rate": 0.0001, + "loss": 7.4831, + "loss/crossentropy": 2.5707833766937256, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23703519254922867, + "step": 4638 + }, + { + "epoch": 0.29, + "grad_norm": 2.5, + "grad_norm_var": 0.22306315104166666, + "learning_rate": 0.0001, + "loss": 7.4067, + "loss/crossentropy": 2.0104441046714783, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22016918659210205, + "step": 4640 + }, + { + "epoch": 0.290125, + "grad_norm": 3.0, + "grad_norm_var": 0.23544514973958333, + "learning_rate": 0.0001, + "loss": 7.3453, + "loss/crossentropy": 2.236189603805542, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22123342752456665, + "step": 4642 + }, + { + "epoch": 0.29025, + "grad_norm": 1.953125, + "grad_norm_var": 0.2549641927083333, + "learning_rate": 0.0001, + "loss": 7.1814, + "loss/crossentropy": 2.1541532278060913, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22347942739725113, + "step": 4644 + }, + { + "epoch": 0.290375, + "grad_norm": 2.21875, + "grad_norm_var": 0.2615559895833333, + "learning_rate": 0.0001, + "loss": 7.2884, + "loss/crossentropy": 2.274155378341675, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23323418200016022, + "step": 4646 + }, + { + "epoch": 0.2905, + "grad_norm": 2.4375, + "grad_norm_var": 0.26042378743489586, + "learning_rate": 0.0001, + "loss": 7.4791, + "loss/crossentropy": 2.08142626285553, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2270170971751213, + "step": 4648 + }, + { + "epoch": 0.290625, + "grad_norm": 2.171875, + "grad_norm_var": 0.2713368733723958, + "learning_rate": 0.0001, + "loss": 7.4897, + "loss/crossentropy": 2.15802264213562, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.20108021795749664, + "step": 4650 + }, + { + "epoch": 0.29075, + "grad_norm": 2.5625, + "grad_norm_var": 0.06603190104166666, + "learning_rate": 0.0001, + "loss": 7.5383, + "loss/crossentropy": 2.215345621109009, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2371804639697075, + "step": 4652 + }, + { + "epoch": 0.290875, + "grad_norm": 2.5, + "grad_norm_var": 0.07634989420572917, + "learning_rate": 0.0001, + "loss": 7.274, + "loss/crossentropy": 2.2180492281913757, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21646256744861603, + "step": 4654 + }, + { + "epoch": 0.291, + "grad_norm": 2.296875, + "grad_norm_var": 0.07224019368489583, + "learning_rate": 0.0001, + "loss": 7.3643, + "loss/crossentropy": 2.1802927255630493, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2141588106751442, + "step": 4656 + }, + { + "epoch": 0.291125, + "grad_norm": 2.328125, + "grad_norm_var": 0.04449462890625, + "learning_rate": 0.0001, + "loss": 7.649, + "loss/crossentropy": 2.5158231258392334, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.243204727768898, + "step": 4658 + }, + { + "epoch": 0.29125, + "grad_norm": 2.328125, + "grad_norm_var": 0.0333892822265625, + "learning_rate": 0.0001, + "loss": 7.463, + "loss/crossentropy": 2.190263271331787, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21288666874170303, + "step": 4660 + }, + { + "epoch": 0.291375, + "grad_norm": 2.578125, + "grad_norm_var": 0.0307281494140625, + "learning_rate": 0.0001, + "loss": 7.4629, + "loss/crossentropy": 2.5436954498291016, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2295963317155838, + "step": 4662 + }, + { + "epoch": 0.2915, + "grad_norm": 2.390625, + "grad_norm_var": 0.031201171875, + "learning_rate": 0.0001, + "loss": 7.5307, + "loss/crossentropy": 2.388418436050415, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.21847142279148102, + "step": 4664 + }, + { + "epoch": 0.291625, + "grad_norm": 2.3125, + "grad_norm_var": 0.02662353515625, + "learning_rate": 0.0001, + "loss": 7.4585, + "loss/crossentropy": 2.4331501722335815, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2292798087000847, + "step": 4666 + }, + { + "epoch": 0.29175, + "grad_norm": 2.265625, + "grad_norm_var": 0.027144368489583334, + "learning_rate": 0.0001, + "loss": 7.2474, + "loss/crossentropy": 2.1129366755485535, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.20572030544281006, + "step": 4668 + }, + { + "epoch": 0.291875, + "grad_norm": 2.390625, + "grad_norm_var": 0.010789998372395833, + "learning_rate": 0.0001, + "loss": 7.5378, + "loss/crossentropy": 2.058245360851288, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20938818156719208, + "step": 4670 + }, + { + "epoch": 0.292, + "grad_norm": 2.4375, + "grad_norm_var": 0.010204060872395834, + "learning_rate": 0.0001, + "loss": 7.5196, + "loss/crossentropy": 2.34469473361969, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2339082881808281, + "step": 4672 + }, + { + "epoch": 0.292125, + "grad_norm": 2.46875, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 7.2852, + "loss/crossentropy": 2.3817514181137085, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21657159179449081, + "step": 4674 + }, + { + "epoch": 0.29225, + "grad_norm": 2.125, + "grad_norm_var": 0.016988118489583332, + "learning_rate": 0.0001, + "loss": 7.3871, + "loss/crossentropy": 2.382703423500061, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2450731173157692, + "step": 4676 + }, + { + "epoch": 0.292375, + "grad_norm": 2.25, + "grad_norm_var": 0.014615885416666667, + "learning_rate": 0.0001, + "loss": 7.4901, + "loss/crossentropy": 2.296135663986206, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.26525241136550903, + "step": 4678 + }, + { + "epoch": 0.2925, + "grad_norm": 2.3125, + "grad_norm_var": 0.012093098958333333, + "learning_rate": 0.0001, + "loss": 7.4537, + "loss/crossentropy": 2.2322874069213867, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22614771127700806, + "step": 4680 + }, + { + "epoch": 0.292625, + "grad_norm": 2.375, + "grad_norm_var": 0.014232381184895834, + "learning_rate": 0.0001, + "loss": 7.3101, + "loss/crossentropy": 2.408819079399109, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21294642984867096, + "step": 4682 + }, + { + "epoch": 0.29275, + "grad_norm": 2.328125, + "grad_norm_var": 0.013434855143229167, + "learning_rate": 0.0001, + "loss": 7.3394, + "loss/crossentropy": 2.1100034713745117, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22328343987464905, + "step": 4684 + }, + { + "epoch": 0.292875, + "grad_norm": 2.390625, + "grad_norm_var": 0.0124664306640625, + "learning_rate": 0.0001, + "loss": 7.1731, + "loss/crossentropy": 1.996330440044403, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.18464084714651108, + "step": 4686 + }, + { + "epoch": 0.293, + "grad_norm": 2.890625, + "grad_norm_var": 0.035481770833333336, + "learning_rate": 0.0001, + "loss": 7.199, + "loss/crossentropy": 2.1880674958229065, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2431424781680107, + "step": 4688 + }, + { + "epoch": 0.293125, + "grad_norm": 2.109375, + "grad_norm_var": 0.0387847900390625, + "learning_rate": 0.0001, + "loss": 7.2796, + "loss/crossentropy": 2.326804280281067, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.21799682080745697, + "step": 4690 + }, + { + "epoch": 0.29325, + "grad_norm": 2.359375, + "grad_norm_var": 0.033524576822916666, + "learning_rate": 0.0001, + "loss": 7.5501, + "loss/crossentropy": 2.309473156929016, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22959060966968536, + "step": 4692 + }, + { + "epoch": 0.293375, + "grad_norm": 2.421875, + "grad_norm_var": 0.03308817545572917, + "learning_rate": 0.0001, + "loss": 7.4887, + "loss/crossentropy": 2.607773542404175, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.23161280155181885, + "step": 4694 + }, + { + "epoch": 0.2935, + "grad_norm": 2.265625, + "grad_norm_var": 0.034333292643229166, + "learning_rate": 0.0001, + "loss": 7.1605, + "loss/crossentropy": 2.140324115753174, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.24282102286815643, + "step": 4696 + }, + { + "epoch": 0.293625, + "grad_norm": 2.359375, + "grad_norm_var": 0.03413798014322917, + "learning_rate": 0.0001, + "loss": 7.4359, + "loss/crossentropy": 2.1101192831993103, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2048392966389656, + "step": 4698 + }, + { + "epoch": 0.29375, + "grad_norm": 2.0625, + "grad_norm_var": 0.040185546875, + "learning_rate": 0.0001, + "loss": 7.3764, + "loss/crossentropy": 2.3710379600524902, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21764564514160156, + "step": 4700 + }, + { + "epoch": 0.293875, + "grad_norm": 2.34375, + "grad_norm_var": 0.03961181640625, + "learning_rate": 0.0001, + "loss": 7.4356, + "loss/crossentropy": 2.3584975004196167, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2174394130706787, + "step": 4702 + }, + { + "epoch": 0.294, + "grad_norm": 2.296875, + "grad_norm_var": 0.014957682291666666, + "learning_rate": 0.0001, + "loss": 7.4147, + "loss/crossentropy": 2.118413209915161, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.19888906925916672, + "step": 4704 + }, + { + "epoch": 0.294125, + "grad_norm": 2.359375, + "grad_norm_var": 0.0112945556640625, + "learning_rate": 0.0001, + "loss": 7.3895, + "loss/crossentropy": 2.2657341957092285, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23323117196559906, + "step": 4706 + }, + { + "epoch": 0.29425, + "grad_norm": 2.28125, + "grad_norm_var": 0.012328084309895833, + "learning_rate": 0.0001, + "loss": 7.2587, + "loss/crossentropy": 2.2151424884796143, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20538458228111267, + "step": 4708 + }, + { + "epoch": 0.294375, + "grad_norm": 2.4375, + "grad_norm_var": 0.010152180989583334, + "learning_rate": 0.0001, + "loss": 7.4708, + "loss/crossentropy": 2.5433419942855835, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.26616372913122177, + "step": 4710 + }, + { + "epoch": 0.2945, + "grad_norm": 2.421875, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 7.2729, + "loss/crossentropy": 2.154632091522217, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2053903043270111, + "step": 4712 + }, + { + "epoch": 0.294625, + "grad_norm": 2.28125, + "grad_norm_var": 0.01275634765625, + "learning_rate": 0.0001, + "loss": 7.4455, + "loss/crossentropy": 2.3647114038467407, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22039730101823807, + "step": 4714 + }, + { + "epoch": 0.29475, + "grad_norm": 2.171875, + "grad_norm_var": 0.0086822509765625, + "learning_rate": 0.0001, + "loss": 7.2138, + "loss/crossentropy": 2.1929028034210205, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21555405855178833, + "step": 4716 + }, + { + "epoch": 0.294875, + "grad_norm": 2.421875, + "grad_norm_var": 0.0091796875, + "learning_rate": 0.0001, + "loss": 7.36, + "loss/crossentropy": 2.084302306175232, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21005475521087646, + "step": 4718 + }, + { + "epoch": 0.295, + "grad_norm": 2.25, + "grad_norm_var": 0.009765625, + "learning_rate": 0.0001, + "loss": 7.3237, + "loss/crossentropy": 2.346268892288208, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22263453155755997, + "step": 4720 + }, + { + "epoch": 0.295125, + "grad_norm": 2.203125, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 7.2444, + "loss/crossentropy": 2.268470883369446, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2138143628835678, + "step": 4722 + }, + { + "epoch": 0.29525, + "grad_norm": 2.703125, + "grad_norm_var": 0.0237945556640625, + "learning_rate": 0.0001, + "loss": 7.6037, + "loss/crossentropy": 2.326021194458008, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2048988789319992, + "step": 4724 + }, + { + "epoch": 0.295375, + "grad_norm": 2.234375, + "grad_norm_var": 0.024811808268229166, + "learning_rate": 0.0001, + "loss": 7.3422, + "loss/crossentropy": 2.193223237991333, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2141074314713478, + "step": 4726 + }, + { + "epoch": 0.2955, + "grad_norm": 2.203125, + "grad_norm_var": 0.0246978759765625, + "learning_rate": 0.0001, + "loss": 7.4967, + "loss/crossentropy": 2.308136224746704, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21569503098726273, + "step": 4728 + }, + { + "epoch": 0.295625, + "grad_norm": 2.203125, + "grad_norm_var": 0.0247711181640625, + "learning_rate": 0.0001, + "loss": 7.3277, + "loss/crossentropy": 1.9237273931503296, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.1951095387339592, + "step": 4730 + }, + { + "epoch": 0.29575, + "grad_norm": 2.40625, + "grad_norm_var": 0.0256988525390625, + "learning_rate": 0.0001, + "loss": 7.3125, + "loss/crossentropy": 2.059843420982361, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21607337892055511, + "step": 4732 + }, + { + "epoch": 0.295875, + "grad_norm": 2.578125, + "grad_norm_var": 0.030598958333333332, + "learning_rate": 0.0001, + "loss": 7.3797, + "loss/crossentropy": 2.239774227142334, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.24183505028486252, + "step": 4734 + }, + { + "epoch": 0.296, + "grad_norm": 2.359375, + "grad_norm_var": 0.030663045247395833, + "learning_rate": 0.0001, + "loss": 7.4556, + "loss/crossentropy": 2.37227463722229, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23292560875415802, + "step": 4736 + }, + { + "epoch": 0.296125, + "grad_norm": 2.21875, + "grad_norm_var": 0.0251861572265625, + "learning_rate": 0.0001, + "loss": 7.304, + "loss/crossentropy": 2.0044411420822144, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19409935921430588, + "step": 4738 + }, + { + "epoch": 0.29625, + "grad_norm": 2.234375, + "grad_norm_var": 0.018094889322916665, + "learning_rate": 0.0001, + "loss": 7.3915, + "loss/crossentropy": 2.3123366832733154, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2075807899236679, + "step": 4740 + }, + { + "epoch": 0.296375, + "grad_norm": 2.3125, + "grad_norm_var": 0.0167388916015625, + "learning_rate": 0.0001, + "loss": 7.3113, + "loss/crossentropy": 1.761619508266449, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.22059950977563858, + "step": 4742 + }, + { + "epoch": 0.2965, + "grad_norm": 2.5, + "grad_norm_var": 0.016584269205729165, + "learning_rate": 0.0001, + "loss": 7.5001, + "loss/crossentropy": 2.106764078140259, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22241321206092834, + "step": 4744 + }, + { + "epoch": 0.296625, + "grad_norm": 2.28125, + "grad_norm_var": 0.0156646728515625, + "learning_rate": 0.0001, + "loss": 7.4221, + "loss/crossentropy": 2.1658183336257935, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2201010212302208, + "step": 4746 + }, + { + "epoch": 0.29675, + "grad_norm": 2.21875, + "grad_norm_var": 0.014216105143229166, + "learning_rate": 0.0001, + "loss": 7.2877, + "loss/crossentropy": 2.288554072380066, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21139442175626755, + "step": 4748 + }, + { + "epoch": 0.296875, + "grad_norm": 2.1875, + "grad_norm_var": 0.012132771809895833, + "learning_rate": 0.0001, + "loss": 7.291, + "loss/crossentropy": 2.209873080253601, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23227518796920776, + "step": 4750 + }, + { + "epoch": 0.297, + "grad_norm": 2.21875, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 7.3108, + "loss/crossentropy": 2.2278741598129272, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.213672935962677, + "step": 4752 + }, + { + "epoch": 0.297125, + "grad_norm": 2.546875, + "grad_norm_var": 0.014351399739583333, + "learning_rate": 0.0001, + "loss": 7.4632, + "loss/crossentropy": 2.1707485914230347, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21832922846078873, + "step": 4754 + }, + { + "epoch": 0.29725, + "grad_norm": 2.21875, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 7.3793, + "loss/crossentropy": 2.068936765193939, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21937869489192963, + "step": 4756 + }, + { + "epoch": 0.297375, + "grad_norm": 2.5625, + "grad_norm_var": 0.0177642822265625, + "learning_rate": 0.0001, + "loss": 7.4439, + "loss/crossentropy": 2.4402244091033936, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2264351323246956, + "step": 4758 + }, + { + "epoch": 0.2975, + "grad_norm": 2.53125, + "grad_norm_var": 0.029377237955729166, + "learning_rate": 0.0001, + "loss": 7.4963, + "loss/crossentropy": 2.351226568222046, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23098966479301453, + "step": 4760 + }, + { + "epoch": 0.297625, + "grad_norm": 2.140625, + "grad_norm_var": 0.03259989420572917, + "learning_rate": 0.0001, + "loss": 7.392, + "loss/crossentropy": 1.9955863952636719, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21300213038921356, + "step": 4762 + }, + { + "epoch": 0.29775, + "grad_norm": 2.390625, + "grad_norm_var": 0.03186848958333333, + "learning_rate": 0.0001, + "loss": 7.5055, + "loss/crossentropy": 2.4948599338531494, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2379990816116333, + "step": 4764 + }, + { + "epoch": 0.297875, + "grad_norm": 2.359375, + "grad_norm_var": 0.0257720947265625, + "learning_rate": 0.0001, + "loss": 7.3315, + "loss/crossentropy": 1.780558168888092, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19055156409740448, + "step": 4766 + }, + { + "epoch": 0.298, + "grad_norm": 2.359375, + "grad_norm_var": 0.0232086181640625, + "learning_rate": 0.0001, + "loss": 7.4719, + "loss/crossentropy": 2.25586998462677, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2168411985039711, + "step": 4768 + }, + { + "epoch": 0.298125, + "grad_norm": 2.296875, + "grad_norm_var": 0.021703084309895832, + "learning_rate": 0.0001, + "loss": 7.3451, + "loss/crossentropy": 2.1937352418899536, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21270034462213516, + "step": 4770 + }, + { + "epoch": 0.29825, + "grad_norm": 2.296875, + "grad_norm_var": 0.019123331705729166, + "learning_rate": 0.0001, + "loss": 7.4653, + "loss/crossentropy": 2.2712541818618774, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23585833609104156, + "step": 4772 + }, + { + "epoch": 0.298375, + "grad_norm": 2.453125, + "grad_norm_var": 0.0224609375, + "learning_rate": 0.0001, + "loss": 7.392, + "loss/crossentropy": 2.311362624168396, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21948038041591644, + "step": 4774 + }, + { + "epoch": 0.2985, + "grad_norm": 2.296875, + "grad_norm_var": 0.010936482747395834, + "learning_rate": 0.0001, + "loss": 7.5252, + "loss/crossentropy": 2.5121726989746094, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23548902571201324, + "step": 4776 + }, + { + "epoch": 0.298625, + "grad_norm": 2.28125, + "grad_norm_var": 0.010725911458333333, + "learning_rate": 0.0001, + "loss": 7.4346, + "loss/crossentropy": 2.352226734161377, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2144220471382141, + "step": 4778 + }, + { + "epoch": 0.29875, + "grad_norm": 2.34375, + "grad_norm_var": 0.0104156494140625, + "learning_rate": 0.0001, + "loss": 7.3003, + "loss/crossentropy": 1.966825008392334, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20474375039339066, + "step": 4780 + }, + { + "epoch": 0.298875, + "grad_norm": 2.171875, + "grad_norm_var": 0.01158447265625, + "learning_rate": 0.0001, + "loss": 7.2383, + "loss/crossentropy": 2.2786693572998047, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22437963634729385, + "step": 4782 + }, + { + "epoch": 0.299, + "grad_norm": 2.125, + "grad_norm_var": 0.012821451822916666, + "learning_rate": 0.0001, + "loss": 7.314, + "loss/crossentropy": 2.1766642332077026, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.21804757416248322, + "step": 4784 + }, + { + "epoch": 0.299125, + "grad_norm": 2.203125, + "grad_norm_var": 0.0131988525390625, + "learning_rate": 0.0001, + "loss": 7.3039, + "loss/crossentropy": 2.009239912033081, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.19700751453638077, + "step": 4786 + }, + { + "epoch": 0.29925, + "grad_norm": 3.109375, + "grad_norm_var": 0.0577545166015625, + "learning_rate": 0.0001, + "loss": 7.3357, + "loss/crossentropy": 2.127845048904419, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21124648302793503, + "step": 4788 + }, + { + "epoch": 0.299375, + "grad_norm": 2.265625, + "grad_norm_var": 0.0581695556640625, + "learning_rate": 0.0001, + "loss": 7.3315, + "loss/crossentropy": 2.303865075111389, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22746270895004272, + "step": 4790 + }, + { + "epoch": 0.2995, + "grad_norm": 2.375, + "grad_norm_var": 0.058984375, + "learning_rate": 0.0001, + "loss": 7.3872, + "loss/crossentropy": 2.15032958984375, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21064457297325134, + "step": 4792 + }, + { + "epoch": 0.299625, + "grad_norm": 2.5625, + "grad_norm_var": 0.059544881184895836, + "learning_rate": 0.0001, + "loss": 7.4218, + "loss/crossentropy": 1.9902217388153076, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275134027004242, + "step": 4794 + }, + { + "epoch": 0.29975, + "grad_norm": 2.296875, + "grad_norm_var": 0.061421712239583336, + "learning_rate": 0.0001, + "loss": 7.3493, + "loss/crossentropy": 2.365830898284912, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22473762929439545, + "step": 4796 + }, + { + "epoch": 0.299875, + "grad_norm": 2.25, + "grad_norm_var": 0.061253865559895836, + "learning_rate": 0.0001, + "loss": 7.2287, + "loss/crossentropy": 2.406466007232666, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2222784459590912, + "step": 4798 + }, + { + "epoch": 0.3, + "grad_norm": 2.0625, + "grad_norm_var": 0.06142578125, + "learning_rate": 0.0001, + "loss": 7.4167, + "loss/crossentropy": 2.2407450675964355, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2090248540043831, + "step": 4800 + }, + { + "epoch": 0.300125, + "grad_norm": 2.46875, + "grad_norm_var": 0.05963134765625, + "learning_rate": 0.0001, + "loss": 7.3583, + "loss/crossentropy": 2.3130555152893066, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21969569474458694, + "step": 4802 + }, + { + "epoch": 0.30025, + "grad_norm": 2.125, + "grad_norm_var": 0.0265625, + "learning_rate": 0.0001, + "loss": 7.2996, + "loss/crossentropy": 2.301561713218689, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21655967831611633, + "step": 4804 + }, + { + "epoch": 0.300375, + "grad_norm": 2.421875, + "grad_norm_var": 0.02265625, + "learning_rate": 0.0001, + "loss": 7.3977, + "loss/crossentropy": 2.4715986251831055, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22068122029304504, + "step": 4806 + }, + { + "epoch": 0.3005, + "grad_norm": 2.578125, + "grad_norm_var": 0.0252838134765625, + "learning_rate": 0.0001, + "loss": 7.3491, + "loss/crossentropy": 2.0583669543266296, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2092050388455391, + "step": 4808 + }, + { + "epoch": 0.300625, + "grad_norm": 2.078125, + "grad_norm_var": 0.026447550455729166, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 2.222812056541443, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.20580071210861206, + "step": 4810 + }, + { + "epoch": 0.30075, + "grad_norm": 2.515625, + "grad_norm_var": 0.027692667643229165, + "learning_rate": 0.0001, + "loss": 7.3078, + "loss/crossentropy": 2.467105746269226, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21557529270648956, + "step": 4812 + }, + { + "epoch": 0.300875, + "grad_norm": 2.25, + "grad_norm_var": 0.029719034830729168, + "learning_rate": 0.0001, + "loss": 7.4463, + "loss/crossentropy": 2.0972548127174377, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.22322270274162292, + "step": 4814 + }, + { + "epoch": 0.301, + "grad_norm": 2.125, + "grad_norm_var": 0.0301910400390625, + "learning_rate": 0.0001, + "loss": 7.3009, + "loss/crossentropy": 2.113277792930603, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21659619361162186, + "step": 4816 + }, + { + "epoch": 0.301125, + "grad_norm": 2.296875, + "grad_norm_var": 0.02880859375, + "learning_rate": 0.0001, + "loss": 7.3721, + "loss/crossentropy": 2.483347177505493, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2171446457505226, + "step": 4818 + }, + { + "epoch": 0.30125, + "grad_norm": 2.265625, + "grad_norm_var": 0.026927693684895834, + "learning_rate": 0.0001, + "loss": 7.3889, + "loss/crossentropy": 2.417271375656128, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22507987171411514, + "step": 4820 + }, + { + "epoch": 0.301375, + "grad_norm": 2.375, + "grad_norm_var": 0.02681884765625, + "learning_rate": 0.0001, + "loss": 7.4797, + "loss/crossentropy": 2.3490647077560425, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2174646332859993, + "step": 4822 + }, + { + "epoch": 0.3015, + "grad_norm": 2.359375, + "grad_norm_var": 0.020164998372395833, + "learning_rate": 0.0001, + "loss": 7.5363, + "loss/crossentropy": 2.400337815284729, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2227081060409546, + "step": 4824 + }, + { + "epoch": 0.301625, + "grad_norm": 3.0, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 7.4569, + "loss/crossentropy": 2.2321704030036926, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.24002033472061157, + "step": 4826 + }, + { + "epoch": 0.30175, + "grad_norm": 2.359375, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 7.5359, + "loss/crossentropy": 2.3871182203292847, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20659293234348297, + "step": 4828 + }, + { + "epoch": 0.301875, + "grad_norm": 3.28125, + "grad_norm_var": 0.09729410807291666, + "learning_rate": 0.0001, + "loss": 7.4514, + "loss/crossentropy": 2.4146281480789185, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22511692345142365, + "step": 4830 + }, + { + "epoch": 0.302, + "grad_norm": 2.765625, + "grad_norm_var": 0.09411519368489583, + "learning_rate": 0.0001, + "loss": 7.3884, + "loss/crossentropy": 2.3637466430664062, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20597800612449646, + "step": 4832 + }, + { + "epoch": 0.302125, + "grad_norm": 2.0625, + "grad_norm_var": 0.10143229166666666, + "learning_rate": 0.0001, + "loss": 7.4427, + "loss/crossentropy": 2.363954782485962, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21014075726270676, + "step": 4834 + }, + { + "epoch": 0.30225, + "grad_norm": 2.390625, + "grad_norm_var": 0.0969146728515625, + "learning_rate": 0.0001, + "loss": 7.2434, + "loss/crossentropy": 2.245696544647217, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20109941810369492, + "step": 4836 + }, + { + "epoch": 0.302375, + "grad_norm": 2.421875, + "grad_norm_var": 0.09761962890625, + "learning_rate": 0.0001, + "loss": 7.3634, + "loss/crossentropy": 2.1231839656829834, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2174079716205597, + "step": 4838 + }, + { + "epoch": 0.3025, + "grad_norm": 2.15625, + "grad_norm_var": 0.11103108723958334, + "learning_rate": 0.0001, + "loss": 7.2504, + "loss/crossentropy": 2.1823160648345947, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21924307942390442, + "step": 4840 + }, + { + "epoch": 0.302625, + "grad_norm": 2.40625, + "grad_norm_var": 0.08931884765625, + "learning_rate": 0.0001, + "loss": 7.4531, + "loss/crossentropy": 2.3826904296875, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.21867043524980545, + "step": 4842 + }, + { + "epoch": 0.30275, + "grad_norm": 2.4375, + "grad_norm_var": 0.08771158854166666, + "learning_rate": 0.0001, + "loss": 7.3825, + "loss/crossentropy": 2.2450802326202393, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196601778268814, + "step": 4844 + }, + { + "epoch": 0.302875, + "grad_norm": 2.65625, + "grad_norm_var": 0.03821207682291667, + "learning_rate": 0.0001, + "loss": 7.5146, + "loss/crossentropy": 2.1403591632843018, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2133881226181984, + "step": 4846 + }, + { + "epoch": 0.303, + "grad_norm": 2.25, + "grad_norm_var": 0.06558329264322917, + "learning_rate": 0.0001, + "loss": 7.435, + "loss/crossentropy": 2.14195853471756, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.19571785628795624, + "step": 4848 + }, + { + "epoch": 0.303125, + "grad_norm": 2.5625, + "grad_norm_var": 0.060221354166666664, + "learning_rate": 0.0001, + "loss": 7.4894, + "loss/crossentropy": 2.2988877296447754, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.22777371853590012, + "step": 4850 + }, + { + "epoch": 0.30325, + "grad_norm": 2.203125, + "grad_norm_var": 0.06297098795572917, + "learning_rate": 0.0001, + "loss": 7.3589, + "loss/crossentropy": 2.390594482421875, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2120780646800995, + "step": 4852 + }, + { + "epoch": 0.303375, + "grad_norm": 2.328125, + "grad_norm_var": 0.06382548014322917, + "learning_rate": 0.0001, + "loss": 7.4284, + "loss/crossentropy": 2.379095196723938, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20808186382055283, + "step": 4854 + }, + { + "epoch": 0.3035, + "grad_norm": 2.375, + "grad_norm_var": 0.0565093994140625, + "learning_rate": 0.0001, + "loss": 7.4303, + "loss/crossentropy": 2.398724317550659, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22028641402721405, + "step": 4856 + }, + { + "epoch": 0.303625, + "grad_norm": 2.546875, + "grad_norm_var": 0.0544097900390625, + "learning_rate": 0.0001, + "loss": 7.5402, + "loss/crossentropy": 2.4652938842773438, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23258233815431595, + "step": 4858 + }, + { + "epoch": 0.30375, + "grad_norm": 2.375, + "grad_norm_var": 0.0521148681640625, + "learning_rate": 0.0001, + "loss": 7.5154, + "loss/crossentropy": 2.15559184551239, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2242182418704033, + "step": 4860 + }, + { + "epoch": 0.303875, + "grad_norm": 2.328125, + "grad_norm_var": 0.04914449055989583, + "learning_rate": 0.0001, + "loss": 7.516, + "loss/crossentropy": 2.3568320274353027, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22093456983566284, + "step": 4862 + }, + { + "epoch": 0.304, + "grad_norm": 2.359375, + "grad_norm_var": 0.018382771809895834, + "learning_rate": 0.0001, + "loss": 7.303, + "loss/crossentropy": 2.130435824394226, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20797330886125565, + "step": 4864 + }, + { + "epoch": 0.304125, + "grad_norm": 2.46875, + "grad_norm_var": 0.017659505208333332, + "learning_rate": 0.0001, + "loss": 7.5087, + "loss/crossentropy": 2.1923974752426147, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2099837362766266, + "step": 4866 + }, + { + "epoch": 0.30425, + "grad_norm": 2.203125, + "grad_norm_var": 0.018993123372395834, + "learning_rate": 0.0001, + "loss": 7.5138, + "loss/crossentropy": 2.3074164986610413, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21502834558486938, + "step": 4868 + }, + { + "epoch": 0.304375, + "grad_norm": 2.609375, + "grad_norm_var": 0.023274739583333332, + "learning_rate": 0.0001, + "loss": 7.6134, + "loss/crossentropy": 2.399793028831482, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2250315099954605, + "step": 4870 + }, + { + "epoch": 0.3045, + "grad_norm": 2.203125, + "grad_norm_var": 0.021906534830729168, + "learning_rate": 0.0001, + "loss": 7.4322, + "loss/crossentropy": 2.5476996898651123, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.21240823715925217, + "step": 4872 + }, + { + "epoch": 0.304625, + "grad_norm": 2.1875, + "grad_norm_var": 0.022847493489583332, + "learning_rate": 0.0001, + "loss": 7.3043, + "loss/crossentropy": 2.406996250152588, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21996094286441803, + "step": 4874 + }, + { + "epoch": 0.30475, + "grad_norm": 2.140625, + "grad_norm_var": 0.02496337890625, + "learning_rate": 0.0001, + "loss": 7.2396, + "loss/crossentropy": 2.1408446431159973, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.212337464094162, + "step": 4876 + }, + { + "epoch": 0.304875, + "grad_norm": 2.3125, + "grad_norm_var": 0.022932942708333334, + "learning_rate": 0.0001, + "loss": 7.3172, + "loss/crossentropy": 2.0700973868370056, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21268072724342346, + "step": 4878 + }, + { + "epoch": 0.305, + "grad_norm": 2.078125, + "grad_norm_var": 0.023412068684895832, + "learning_rate": 0.0001, + "loss": 7.3727, + "loss/crossentropy": 2.317967474460602, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22330276668071747, + "step": 4880 + }, + { + "epoch": 0.305125, + "grad_norm": 2.4375, + "grad_norm_var": 0.020633951822916666, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 2.3994847536087036, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2378486543893814, + "step": 4882 + }, + { + "epoch": 0.30525, + "grad_norm": 2.046875, + "grad_norm_var": 0.028446451822916666, + "learning_rate": 0.0001, + "loss": 7.3097, + "loss/crossentropy": 2.103451728820801, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20587532967329025, + "step": 4884 + }, + { + "epoch": 0.305375, + "grad_norm": 2.375, + "grad_norm_var": 0.019172159830729167, + "learning_rate": 0.0001, + "loss": 7.2349, + "loss/crossentropy": 2.2608002424240112, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21245233714580536, + "step": 4886 + }, + { + "epoch": 0.3055, + "grad_norm": 2.25, + "grad_norm_var": 0.018586222330729166, + "learning_rate": 0.0001, + "loss": 7.4389, + "loss/crossentropy": 2.401768207550049, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22278450429439545, + "step": 4888 + }, + { + "epoch": 0.305625, + "grad_norm": 2.296875, + "grad_norm_var": 0.018798828125, + "learning_rate": 0.0001, + "loss": 7.3505, + "loss/crossentropy": 2.100538969039917, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23200103640556335, + "step": 4890 + }, + { + "epoch": 0.30575, + "grad_norm": 2.40625, + "grad_norm_var": 0.03186442057291667, + "learning_rate": 0.0001, + "loss": 7.2995, + "loss/crossentropy": 2.3301326036453247, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24110360443592072, + "step": 4892 + }, + { + "epoch": 0.305875, + "grad_norm": 2.3125, + "grad_norm_var": 0.03230692545572917, + "learning_rate": 0.0001, + "loss": 7.3208, + "loss/crossentropy": 2.4211736917495728, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22261983156204224, + "step": 4894 + }, + { + "epoch": 0.306, + "grad_norm": 2.125, + "grad_norm_var": 0.030964152018229166, + "learning_rate": 0.0001, + "loss": 7.2336, + "loss/crossentropy": 2.1695640087127686, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20674562454223633, + "step": 4896 + }, + { + "epoch": 0.306125, + "grad_norm": 2.09375, + "grad_norm_var": 0.031769816080729166, + "learning_rate": 0.0001, + "loss": 7.2113, + "loss/crossentropy": 2.0899396538734436, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20201702415943146, + "step": 4898 + }, + { + "epoch": 0.30625, + "grad_norm": 2.25, + "grad_norm_var": 0.025055948893229166, + "learning_rate": 0.0001, + "loss": 7.1833, + "loss/crossentropy": 2.3102033138275146, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22870182991027832, + "step": 4900 + }, + { + "epoch": 0.306375, + "grad_norm": 2.265625, + "grad_norm_var": 0.024462890625, + "learning_rate": 0.0001, + "loss": 7.429, + "loss/crossentropy": 2.093406856060028, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21099549531936646, + "step": 4902 + }, + { + "epoch": 0.3065, + "grad_norm": 2.234375, + "grad_norm_var": 0.024054972330729167, + "learning_rate": 0.0001, + "loss": 7.2861, + "loss/crossentropy": 2.1686906814575195, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20351973176002502, + "step": 4904 + }, + { + "epoch": 0.306625, + "grad_norm": 2.390625, + "grad_norm_var": 0.024247233072916666, + "learning_rate": 0.0001, + "loss": 7.4273, + "loss/crossentropy": 2.0178955793380737, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2049720138311386, + "step": 4906 + }, + { + "epoch": 0.30675, + "grad_norm": 2.15625, + "grad_norm_var": 0.00933837890625, + "learning_rate": 0.0001, + "loss": 7.3421, + "loss/crossentropy": 2.164846181869507, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.20680507272481918, + "step": 4908 + }, + { + "epoch": 0.306875, + "grad_norm": 2.28125, + "grad_norm_var": 0.009398396809895833, + "learning_rate": 0.0001, + "loss": 7.3451, + "loss/crossentropy": 2.324475646018982, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22731740027666092, + "step": 4910 + }, + { + "epoch": 0.307, + "grad_norm": 2.34375, + "grad_norm_var": 0.009642537434895833, + "learning_rate": 0.0001, + "loss": 7.3873, + "loss/crossentropy": 2.7607351541519165, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.239481620490551, + "step": 4912 + }, + { + "epoch": 0.307125, + "grad_norm": 2.15625, + "grad_norm_var": 0.0101470947265625, + "learning_rate": 0.0001, + "loss": 7.3776, + "loss/crossentropy": 2.286571979522705, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2324925810098648, + "step": 4914 + }, + { + "epoch": 0.30725, + "grad_norm": 2.203125, + "grad_norm_var": 0.009251912434895834, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.0468556880950928, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.220846489071846, + "step": 4916 + }, + { + "epoch": 0.307375, + "grad_norm": 2.34375, + "grad_norm_var": 0.017464192708333333, + "learning_rate": 0.0001, + "loss": 7.4308, + "loss/crossentropy": 2.3432207107543945, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21841663122177124, + "step": 4918 + }, + { + "epoch": 0.3075, + "grad_norm": 2.4375, + "grad_norm_var": 0.0173248291015625, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 1.9974586367607117, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.20223890990018845, + "step": 4920 + }, + { + "epoch": 0.307625, + "grad_norm": 2.265625, + "grad_norm_var": 0.017601521809895833, + "learning_rate": 0.0001, + "loss": 7.3917, + "loss/crossentropy": 2.1487491130828857, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20649082213640213, + "step": 4922 + }, + { + "epoch": 0.30775, + "grad_norm": 2.265625, + "grad_norm_var": 0.016162109375, + "learning_rate": 0.0001, + "loss": 7.3709, + "loss/crossentropy": 2.0778582096099854, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2082173004746437, + "step": 4924 + }, + { + "epoch": 0.307875, + "grad_norm": 2.140625, + "grad_norm_var": 0.0177642822265625, + "learning_rate": 0.0001, + "loss": 7.4308, + "loss/crossentropy": 2.4916462898254395, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23393811285495758, + "step": 4926 + }, + { + "epoch": 0.308, + "grad_norm": 2.390625, + "grad_norm_var": 0.016893513997395835, + "learning_rate": 0.0001, + "loss": 7.2895, + "loss/crossentropy": 2.1852511167526245, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1971207559108734, + "step": 4928 + }, + { + "epoch": 0.308125, + "grad_norm": 2.40625, + "grad_norm_var": 0.014404296875, + "learning_rate": 0.0001, + "loss": 7.4778, + "loss/crossentropy": 2.4292192459106445, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20822913944721222, + "step": 4930 + }, + { + "epoch": 0.30825, + "grad_norm": 2.09375, + "grad_norm_var": 0.017508951822916667, + "learning_rate": 0.0001, + "loss": 7.3199, + "loss/crossentropy": 2.162381172180176, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21929100155830383, + "step": 4932 + }, + { + "epoch": 0.308375, + "grad_norm": 2.265625, + "grad_norm_var": 0.00947265625, + "learning_rate": 0.0001, + "loss": 7.325, + "loss/crossentropy": 2.1474106311798096, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2182038575410843, + "step": 4934 + }, + { + "epoch": 0.3085, + "grad_norm": 2.109375, + "grad_norm_var": 0.009033203125, + "learning_rate": 0.0001, + "loss": 7.4231, + "loss/crossentropy": 2.2566583156585693, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20229701697826385, + "step": 4936 + }, + { + "epoch": 0.308625, + "grad_norm": 2.234375, + "grad_norm_var": 0.0088775634765625, + "learning_rate": 0.0001, + "loss": 7.4338, + "loss/crossentropy": 2.361881971359253, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22635971009731293, + "step": 4938 + }, + { + "epoch": 0.30875, + "grad_norm": 2.515625, + "grad_norm_var": 0.014403279622395833, + "learning_rate": 0.0001, + "loss": 7.4386, + "loss/crossentropy": 2.214319109916687, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23132434487342834, + "step": 4940 + }, + { + "epoch": 0.308875, + "grad_norm": 2.5, + "grad_norm_var": 0.016576131184895832, + "learning_rate": 0.0001, + "loss": 7.3068, + "loss/crossentropy": 2.063450336456299, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20887789130210876, + "step": 4942 + }, + { + "epoch": 0.309, + "grad_norm": 2.0625, + "grad_norm_var": 0.018062337239583334, + "learning_rate": 0.0001, + "loss": 7.3097, + "loss/crossentropy": 2.1654560565948486, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20234611630439758, + "step": 4944 + }, + { + "epoch": 0.309125, + "grad_norm": 2.234375, + "grad_norm_var": 0.0166168212890625, + "learning_rate": 0.0001, + "loss": 7.3565, + "loss/crossentropy": 2.086349129676819, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.1944892778992653, + "step": 4946 + }, + { + "epoch": 0.30925, + "grad_norm": 2.359375, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 7.4902, + "loss/crossentropy": 2.4319673776626587, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2345517948269844, + "step": 4948 + }, + { + "epoch": 0.309375, + "grad_norm": 2.25, + "grad_norm_var": 0.015869140625, + "learning_rate": 0.0001, + "loss": 7.3198, + "loss/crossentropy": 2.2893803119659424, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21564996242523193, + "step": 4950 + }, + { + "epoch": 0.3095, + "grad_norm": 2.15625, + "grad_norm_var": 0.0133697509765625, + "learning_rate": 0.0001, + "loss": 7.3128, + "loss/crossentropy": 2.2920368909835815, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.227824367582798, + "step": 4952 + }, + { + "epoch": 0.309625, + "grad_norm": 2.234375, + "grad_norm_var": 0.0130035400390625, + "learning_rate": 0.0001, + "loss": 7.2905, + "loss/crossentropy": 2.1902358531951904, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22382274270057678, + "step": 4954 + }, + { + "epoch": 0.30975, + "grad_norm": 2.25, + "grad_norm_var": 0.008885701497395834, + "learning_rate": 0.0001, + "loss": 7.1857, + "loss/crossentropy": 2.1179298162460327, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.19988138228654861, + "step": 4956 + }, + { + "epoch": 0.309875, + "grad_norm": 2.265625, + "grad_norm_var": 0.0048980712890625, + "learning_rate": 0.0001, + "loss": 7.3741, + "loss/crossentropy": 2.7137014865875244, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2347271665930748, + "step": 4958 + }, + { + "epoch": 0.31, + "grad_norm": 2.328125, + "grad_norm_var": 0.0032379150390625, + "learning_rate": 0.0001, + "loss": 7.5647, + "loss/crossentropy": 2.31722891330719, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21784493327140808, + "step": 4960 + }, + { + "epoch": 0.310125, + "grad_norm": 2.671875, + "grad_norm_var": 0.0297760009765625, + "learning_rate": 0.0001, + "loss": 7.5544, + "loss/crossentropy": 2.3354387283325195, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2259213700890541, + "step": 4962 + }, + { + "epoch": 0.31025, + "grad_norm": 2.03125, + "grad_norm_var": 0.0390777587890625, + "learning_rate": 0.0001, + "loss": 7.2878, + "loss/crossentropy": 1.9992610216140747, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.206348218023777, + "step": 4964 + }, + { + "epoch": 0.310375, + "grad_norm": 2.25, + "grad_norm_var": 0.041304524739583334, + "learning_rate": 0.0001, + "loss": 7.1877, + "loss/crossentropy": 2.0673335790634155, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21330340206623077, + "step": 4966 + }, + { + "epoch": 0.3105, + "grad_norm": 2.109375, + "grad_norm_var": 0.04544270833333333, + "learning_rate": 0.0001, + "loss": 7.2614, + "loss/crossentropy": 2.302871584892273, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2201063483953476, + "step": 4968 + }, + { + "epoch": 0.310625, + "grad_norm": 2.421875, + "grad_norm_var": 0.0468414306640625, + "learning_rate": 0.0001, + "loss": 7.4131, + "loss/crossentropy": 2.3394633531570435, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2507524937391281, + "step": 4970 + }, + { + "epoch": 0.31075, + "grad_norm": 2.171875, + "grad_norm_var": 0.04869384765625, + "learning_rate": 0.0001, + "loss": 7.5202, + "loss/crossentropy": 2.5619258880615234, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2357812374830246, + "step": 4972 + }, + { + "epoch": 0.310875, + "grad_norm": 2.578125, + "grad_norm_var": 0.05373433430989583, + "learning_rate": 0.0001, + "loss": 7.3841, + "loss/crossentropy": 2.0160459876060486, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22442921996116638, + "step": 4974 + }, + { + "epoch": 0.311, + "grad_norm": 2.109375, + "grad_norm_var": 0.0671875, + "learning_rate": 0.0001, + "loss": 7.4319, + "loss/crossentropy": 2.474762439727783, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22143270820379257, + "step": 4976 + }, + { + "epoch": 0.311125, + "grad_norm": 2.171875, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 0.0001, + "loss": 7.3106, + "loss/crossentropy": 2.3460559844970703, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21027249097824097, + "step": 4978 + }, + { + "epoch": 0.31125, + "grad_norm": 2.21875, + "grad_norm_var": 0.0361328125, + "learning_rate": 0.0001, + "loss": 7.2274, + "loss/crossentropy": 1.8983039855957031, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.19185489416122437, + "step": 4980 + }, + { + "epoch": 0.311375, + "grad_norm": 2.34375, + "grad_norm_var": 0.0352447509765625, + "learning_rate": 0.0001, + "loss": 7.5014, + "loss/crossentropy": 2.4294410943984985, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21551431715488434, + "step": 4982 + }, + { + "epoch": 0.3115, + "grad_norm": 2.484375, + "grad_norm_var": 0.030516560872395834, + "learning_rate": 0.0001, + "loss": 7.1866, + "loss/crossentropy": 2.183108687400818, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21338380873203278, + "step": 4984 + }, + { + "epoch": 0.311625, + "grad_norm": 2.21875, + "grad_norm_var": 0.033463541666666666, + "learning_rate": 0.0001, + "loss": 7.3706, + "loss/crossentropy": 2.182175934314728, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21428421884775162, + "step": 4986 + }, + { + "epoch": 0.31175, + "grad_norm": 2.40625, + "grad_norm_var": 0.031468709309895836, + "learning_rate": 0.0001, + "loss": 7.5321, + "loss/crossentropy": 2.381687641143799, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21604669839143753, + "step": 4988 + }, + { + "epoch": 0.311875, + "grad_norm": 2.203125, + "grad_norm_var": 0.02672119140625, + "learning_rate": 0.0001, + "loss": 7.3109, + "loss/crossentropy": 2.211581826210022, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22757954895496368, + "step": 4990 + }, + { + "epoch": 0.312, + "grad_norm": 2.328125, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 7.3448, + "loss/crossentropy": 2.164597749710083, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.229446142911911, + "step": 4992 + }, + { + "epoch": 0.312125, + "grad_norm": 2.3125, + "grad_norm_var": 0.009943644205729166, + "learning_rate": 0.0001, + "loss": 7.2848, + "loss/crossentropy": 2.303720474243164, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21908074617385864, + "step": 4994 + }, + { + "epoch": 0.31225, + "grad_norm": 2.21875, + "grad_norm_var": 0.011351521809895833, + "learning_rate": 0.0001, + "loss": 7.4035, + "loss/crossentropy": 2.4575328826904297, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22248198091983795, + "step": 4996 + }, + { + "epoch": 0.312375, + "grad_norm": 2.453125, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 7.507, + "loss/crossentropy": 2.3077362775802612, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21921824663877487, + "step": 4998 + }, + { + "epoch": 0.3125, + "grad_norm": 2.296875, + "grad_norm_var": 0.017406209309895834, + "learning_rate": 0.0001, + "loss": 7.3155, + "loss/crossentropy": 2.1894084215164185, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21899671852588654, + "step": 5000 + }, + { + "epoch": 0.312625, + "grad_norm": 2.34375, + "grad_norm_var": 0.015135701497395833, + "learning_rate": 0.0001, + "loss": 7.3408, + "loss/crossentropy": 2.262491822242737, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19611389189958572, + "step": 5002 + }, + { + "epoch": 0.31275, + "grad_norm": 2.53125, + "grad_norm_var": 0.01842041015625, + "learning_rate": 0.0001, + "loss": 7.2152, + "loss/crossentropy": 1.982454240322113, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.20768950879573822, + "step": 5004 + }, + { + "epoch": 0.312875, + "grad_norm": 2.03125, + "grad_norm_var": 0.023763020833333332, + "learning_rate": 0.0001, + "loss": 7.2139, + "loss/crossentropy": 2.2430403232574463, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21962688863277435, + "step": 5006 + }, + { + "epoch": 0.313, + "grad_norm": 2.359375, + "grad_norm_var": 0.027595011393229167, + "learning_rate": 0.0001, + "loss": 7.3438, + "loss/crossentropy": 2.3544063568115234, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22437047958374023, + "step": 5008 + }, + { + "epoch": 0.313125, + "grad_norm": 2.109375, + "grad_norm_var": 0.03183186848958333, + "learning_rate": 0.0001, + "loss": 7.3206, + "loss/crossentropy": 2.342305541038513, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2293403297662735, + "step": 5010 + }, + { + "epoch": 0.31325, + "grad_norm": 2.28125, + "grad_norm_var": 0.031769816080729166, + "learning_rate": 0.0001, + "loss": 7.2882, + "loss/crossentropy": 2.2680485248565674, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22678761184215546, + "step": 5012 + }, + { + "epoch": 0.313375, + "grad_norm": 2.390625, + "grad_norm_var": 0.024507649739583335, + "learning_rate": 0.0001, + "loss": 7.4037, + "loss/crossentropy": 2.13227915763855, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20939987897872925, + "step": 5014 + }, + { + "epoch": 0.3135, + "grad_norm": 2.15625, + "grad_norm_var": 0.0258697509765625, + "learning_rate": 0.0001, + "loss": 7.3085, + "loss/crossentropy": 2.2183371782302856, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22399604320526123, + "step": 5016 + }, + { + "epoch": 0.313625, + "grad_norm": 2.34375, + "grad_norm_var": 0.023534138997395832, + "learning_rate": 0.0001, + "loss": 7.3265, + "loss/crossentropy": 2.3377569913864136, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21345963329076767, + "step": 5018 + }, + { + "epoch": 0.31375, + "grad_norm": 2.140625, + "grad_norm_var": 0.022001139322916665, + "learning_rate": 0.0001, + "loss": 7.203, + "loss/crossentropy": 2.2402291893959045, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2167833298444748, + "step": 5020 + }, + { + "epoch": 0.313875, + "grad_norm": 2.21875, + "grad_norm_var": 0.0185211181640625, + "learning_rate": 0.0001, + "loss": 7.1242, + "loss/crossentropy": 2.1168267726898193, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.20187420397996902, + "step": 5022 + }, + { + "epoch": 0.314, + "grad_norm": 2.328125, + "grad_norm_var": 0.017838541666666666, + "learning_rate": 0.0001, + "loss": 7.3825, + "loss/crossentropy": 2.1705461740493774, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21749653667211533, + "step": 5024 + }, + { + "epoch": 0.314125, + "grad_norm": 2.453125, + "grad_norm_var": 0.015705362955729166, + "learning_rate": 0.0001, + "loss": 7.3272, + "loss/crossentropy": 2.2871659994125366, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23058424144983292, + "step": 5026 + }, + { + "epoch": 0.31425, + "grad_norm": 2.296875, + "grad_norm_var": 0.0137359619140625, + "learning_rate": 0.0001, + "loss": 7.3611, + "loss/crossentropy": 2.341141700744629, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23287169635295868, + "step": 5028 + }, + { + "epoch": 0.314375, + "grad_norm": 2.25, + "grad_norm_var": 0.013993326822916667, + "learning_rate": 0.0001, + "loss": 7.3674, + "loss/crossentropy": 1.9995309114456177, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20135702937841415, + "step": 5030 + }, + { + "epoch": 0.3145, + "grad_norm": 2.3125, + "grad_norm_var": 0.012694295247395833, + "learning_rate": 0.0001, + "loss": 7.2974, + "loss/crossentropy": 2.396213173866272, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22502055764198303, + "step": 5032 + }, + { + "epoch": 0.314625, + "grad_norm": 2.234375, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 7.3376, + "loss/crossentropy": 2.290814518928528, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21020909398794174, + "step": 5034 + }, + { + "epoch": 0.31475, + "grad_norm": 2.0, + "grad_norm_var": 0.017414347330729166, + "learning_rate": 0.0001, + "loss": 7.2327, + "loss/crossentropy": 2.5261220932006836, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21174553781747818, + "step": 5036 + }, + { + "epoch": 0.314875, + "grad_norm": 2.40625, + "grad_norm_var": 0.020197550455729168, + "learning_rate": 0.0001, + "loss": 7.2107, + "loss/crossentropy": 2.3141993284225464, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21525625884532928, + "step": 5038 + }, + { + "epoch": 0.315, + "grad_norm": 2.640625, + "grad_norm_var": 0.04215087890625, + "learning_rate": 0.0001, + "loss": 7.3146, + "loss/crossentropy": 2.3335756063461304, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21623239666223526, + "step": 5040 + }, + { + "epoch": 0.315125, + "grad_norm": 2.09375, + "grad_norm_var": 0.045633951822916664, + "learning_rate": 0.0001, + "loss": 7.2735, + "loss/crossentropy": 2.2438907623291016, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22927288711071014, + "step": 5042 + }, + { + "epoch": 0.31525, + "grad_norm": 2.421875, + "grad_norm_var": 0.047261555989583336, + "learning_rate": 0.0001, + "loss": 7.4352, + "loss/crossentropy": 2.2331418991088867, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21440240740776062, + "step": 5044 + }, + { + "epoch": 0.315375, + "grad_norm": 2.46875, + "grad_norm_var": 0.0459136962890625, + "learning_rate": 0.0001, + "loss": 7.4132, + "loss/crossentropy": 2.272444486618042, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2074766531586647, + "step": 5046 + }, + { + "epoch": 0.3155, + "grad_norm": 2.140625, + "grad_norm_var": 0.047526041666666664, + "learning_rate": 0.0001, + "loss": 7.1822, + "loss/crossentropy": 2.2327855825424194, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.23146355152130127, + "step": 5048 + }, + { + "epoch": 0.315625, + "grad_norm": 2.3125, + "grad_norm_var": 0.048151652018229164, + "learning_rate": 0.0001, + "loss": 7.1235, + "loss/crossentropy": 2.138622522354126, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2159329056739807, + "step": 5050 + }, + { + "epoch": 0.31575, + "grad_norm": 2.359375, + "grad_norm_var": 0.03786519368489583, + "learning_rate": 0.0001, + "loss": 7.3683, + "loss/crossentropy": 2.407191514968872, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2199866697192192, + "step": 5052 + }, + { + "epoch": 0.315875, + "grad_norm": 2.390625, + "grad_norm_var": 0.03487955729166667, + "learning_rate": 0.0001, + "loss": 7.3308, + "loss/crossentropy": 2.131745457649231, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.209860198199749, + "step": 5054 + }, + { + "epoch": 0.316, + "grad_norm": 2.125, + "grad_norm_var": 0.018387858072916666, + "learning_rate": 0.0001, + "loss": 7.2193, + "loss/crossentropy": 2.0351319909095764, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2149025946855545, + "step": 5056 + }, + { + "epoch": 0.316125, + "grad_norm": 2.234375, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 7.1825, + "loss/crossentropy": 2.4224579334259033, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21975557506084442, + "step": 5058 + }, + { + "epoch": 0.31625, + "grad_norm": 2.125, + "grad_norm_var": 0.015070597330729166, + "learning_rate": 0.0001, + "loss": 7.273, + "loss/crossentropy": 2.1384547352790833, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2082139551639557, + "step": 5060 + }, + { + "epoch": 0.316375, + "grad_norm": 2.203125, + "grad_norm_var": 0.013044230143229167, + "learning_rate": 0.0001, + "loss": 7.3584, + "loss/crossentropy": 2.4295462369918823, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22232317924499512, + "step": 5062 + }, + { + "epoch": 0.3165, + "grad_norm": 2.171875, + "grad_norm_var": 0.0131256103515625, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 2.2445785999298096, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2112947255373001, + "step": 5064 + }, + { + "epoch": 0.316625, + "grad_norm": 2.265625, + "grad_norm_var": 0.0092193603515625, + "learning_rate": 0.0001, + "loss": 7.3904, + "loss/crossentropy": 2.491084575653076, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23876774311065674, + "step": 5066 + }, + { + "epoch": 0.31675, + "grad_norm": 2.265625, + "grad_norm_var": 0.00875244140625, + "learning_rate": 0.0001, + "loss": 7.3886, + "loss/crossentropy": 2.209762454032898, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21998873353004456, + "step": 5068 + }, + { + "epoch": 0.316875, + "grad_norm": 2.1875, + "grad_norm_var": 0.007373046875, + "learning_rate": 0.0001, + "loss": 7.354, + "loss/crossentropy": 2.184313654899597, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21638303250074387, + "step": 5070 + }, + { + "epoch": 0.317, + "grad_norm": 2.40625, + "grad_norm_var": 0.00654296875, + "learning_rate": 0.0001, + "loss": 7.2445, + "loss/crossentropy": 1.932084858417511, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20044665038585663, + "step": 5072 + }, + { + "epoch": 0.317125, + "grad_norm": 2.140625, + "grad_norm_var": 0.00751953125, + "learning_rate": 0.0001, + "loss": 7.147, + "loss/crossentropy": 2.099605619907379, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1961287409067154, + "step": 5074 + }, + { + "epoch": 0.31725, + "grad_norm": 2.40625, + "grad_norm_var": 0.0076080322265625, + "learning_rate": 0.0001, + "loss": 7.4359, + "loss/crossentropy": 2.4930461645126343, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.19766415655612946, + "step": 5076 + }, + { + "epoch": 0.317375, + "grad_norm": 2.09375, + "grad_norm_var": 0.009847005208333334, + "learning_rate": 0.0001, + "loss": 7.2892, + "loss/crossentropy": 2.0933732390403748, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2190747633576393, + "step": 5078 + }, + { + "epoch": 0.3175, + "grad_norm": 2.34375, + "grad_norm_var": 0.009794108072916667, + "learning_rate": 0.0001, + "loss": 7.2631, + "loss/crossentropy": 2.2265865802764893, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19832175970077515, + "step": 5080 + }, + { + "epoch": 0.317625, + "grad_norm": 2.078125, + "grad_norm_var": 0.012955729166666667, + "learning_rate": 0.0001, + "loss": 7.3764, + "loss/crossentropy": 2.1202717423439026, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22476188838481903, + "step": 5082 + }, + { + "epoch": 0.31775, + "grad_norm": 2.171875, + "grad_norm_var": 0.013671875, + "learning_rate": 0.0001, + "loss": 7.182, + "loss/crossentropy": 2.208008825778961, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21714557707309723, + "step": 5084 + }, + { + "epoch": 0.317875, + "grad_norm": 2.328125, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 7.2372, + "loss/crossentropy": 2.3373151421546936, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23226696997880936, + "step": 5086 + }, + { + "epoch": 0.318, + "grad_norm": 2.40625, + "grad_norm_var": 0.014256795247395834, + "learning_rate": 0.0001, + "loss": 7.3527, + "loss/crossentropy": 2.235350489616394, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.20813613384962082, + "step": 5088 + }, + { + "epoch": 0.318125, + "grad_norm": 2.296875, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 7.3671, + "loss/crossentropy": 2.436918258666992, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2232442870736122, + "step": 5090 + }, + { + "epoch": 0.31825, + "grad_norm": 2.21875, + "grad_norm_var": 0.018684895833333333, + "learning_rate": 0.0001, + "loss": 7.2646, + "loss/crossentropy": 2.3693257570266724, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2209920957684517, + "step": 5092 + }, + { + "epoch": 0.318375, + "grad_norm": 2.171875, + "grad_norm_var": 0.020164998372395833, + "learning_rate": 0.0001, + "loss": 7.4097, + "loss/crossentropy": 2.5247997045516968, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23687294870615005, + "step": 5094 + }, + { + "epoch": 0.3185, + "grad_norm": 2.359375, + "grad_norm_var": 0.020384724934895834, + "learning_rate": 0.0001, + "loss": 7.2837, + "loss/crossentropy": 2.009072959423065, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20863434672355652, + "step": 5096 + }, + { + "epoch": 0.318625, + "grad_norm": 2.09375, + "grad_norm_var": 0.019173177083333333, + "learning_rate": 0.0001, + "loss": 7.1841, + "loss/crossentropy": 2.075112044811249, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2084924578666687, + "step": 5098 + }, + { + "epoch": 0.31875, + "grad_norm": 2.28125, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 7.2721, + "loss/crossentropy": 2.15164053440094, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20492035150527954, + "step": 5100 + }, + { + "epoch": 0.318875, + "grad_norm": 2.390625, + "grad_norm_var": 0.01708984375, + "learning_rate": 0.0001, + "loss": 7.3294, + "loss/crossentropy": 1.9480576515197754, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20920200645923615, + "step": 5102 + }, + { + "epoch": 0.319, + "grad_norm": 2.09375, + "grad_norm_var": 0.018163045247395832, + "learning_rate": 0.0001, + "loss": 7.3463, + "loss/crossentropy": 2.2869019508361816, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21756701171398163, + "step": 5104 + }, + { + "epoch": 0.319125, + "grad_norm": 2.28125, + "grad_norm_var": 0.017650349934895834, + "learning_rate": 0.0001, + "loss": 7.1405, + "loss/crossentropy": 2.0993104577064514, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21797939389944077, + "step": 5106 + }, + { + "epoch": 0.31925, + "grad_norm": 2.234375, + "grad_norm_var": 0.0272613525390625, + "learning_rate": 0.0001, + "loss": 7.3592, + "loss/crossentropy": 2.4255337715148926, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21611501276493073, + "step": 5108 + }, + { + "epoch": 0.319375, + "grad_norm": 2.21875, + "grad_norm_var": 0.0240631103515625, + "learning_rate": 0.0001, + "loss": 7.2214, + "loss/crossentropy": 2.370956540107727, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.23814254999160767, + "step": 5110 + }, + { + "epoch": 0.3195, + "grad_norm": 2.421875, + "grad_norm_var": 0.025191243489583334, + "learning_rate": 0.0001, + "loss": 7.2919, + "loss/crossentropy": 2.354593515396118, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22845982760190964, + "step": 5112 + }, + { + "epoch": 0.319625, + "grad_norm": 2.171875, + "grad_norm_var": 0.023582967122395833, + "learning_rate": 0.0001, + "loss": 7.3072, + "loss/crossentropy": 2.1754024028778076, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.23337450623512268, + "step": 5114 + }, + { + "epoch": 0.31975, + "grad_norm": 2.34375, + "grad_norm_var": 0.0242340087890625, + "learning_rate": 0.0001, + "loss": 7.3127, + "loss/crossentropy": 2.1386443972587585, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2071130946278572, + "step": 5116 + }, + { + "epoch": 0.319875, + "grad_norm": 2.234375, + "grad_norm_var": 0.023531087239583335, + "learning_rate": 0.0001, + "loss": 7.3132, + "loss/crossentropy": 2.3322278261184692, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21129751950502396, + "step": 5118 + }, + { + "epoch": 0.32, + "grad_norm": 2.265625, + "grad_norm_var": 0.0241119384765625, + "learning_rate": 0.0001, + "loss": 7.2513, + "loss/crossentropy": 2.294466018676758, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.20739653706550598, + "step": 5120 + }, + { + "epoch": 0.320125, + "grad_norm": 2.171875, + "grad_norm_var": 0.023388671875, + "learning_rate": 0.0001, + "loss": 7.1934, + "loss/crossentropy": 2.16942036151886, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.19805258512496948, + "step": 5122 + }, + { + "epoch": 0.32025, + "grad_norm": 2.625, + "grad_norm_var": 0.018831380208333335, + "learning_rate": 0.0001, + "loss": 7.507, + "loss/crossentropy": 2.132123589515686, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21495044976472855, + "step": 5124 + }, + { + "epoch": 0.320375, + "grad_norm": 2.0, + "grad_norm_var": 0.024616495768229166, + "learning_rate": 0.0001, + "loss": 7.2239, + "loss/crossentropy": 1.760918915271759, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.17673537880182266, + "step": 5126 + }, + { + "epoch": 0.3205, + "grad_norm": 2.3125, + "grad_norm_var": 0.024144490559895832, + "learning_rate": 0.0001, + "loss": 7.4896, + "loss/crossentropy": 2.4206674098968506, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2217917963862419, + "step": 5128 + }, + { + "epoch": 0.320625, + "grad_norm": 2.28125, + "grad_norm_var": 0.025658162434895833, + "learning_rate": 0.0001, + "loss": 7.2327, + "loss/crossentropy": 2.1627532839775085, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20185734331607819, + "step": 5130 + }, + { + "epoch": 0.32075, + "grad_norm": 2.3125, + "grad_norm_var": 0.0330078125, + "learning_rate": 0.0001, + "loss": 7.4634, + "loss/crossentropy": 2.143425464630127, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21182992309331894, + "step": 5132 + }, + { + "epoch": 0.320875, + "grad_norm": 2.375, + "grad_norm_var": 0.03430887858072917, + "learning_rate": 0.0001, + "loss": 7.3735, + "loss/crossentropy": 1.9043779969215393, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.19466465711593628, + "step": 5134 + }, + { + "epoch": 0.321, + "grad_norm": 2.046875, + "grad_norm_var": 0.04011942545572917, + "learning_rate": 0.0001, + "loss": 7.2799, + "loss/crossentropy": 2.3145501613616943, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23457962274551392, + "step": 5136 + }, + { + "epoch": 0.321125, + "grad_norm": 2.484375, + "grad_norm_var": 0.040648396809895834, + "learning_rate": 0.0001, + "loss": 7.3412, + "loss/crossentropy": 2.338579297065735, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23789451271295547, + "step": 5138 + }, + { + "epoch": 0.32125, + "grad_norm": 2.578125, + "grad_norm_var": 0.036848958333333334, + "learning_rate": 0.0001, + "loss": 7.509, + "loss/crossentropy": 2.0653016567230225, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23478703200817108, + "step": 5140 + }, + { + "epoch": 0.321375, + "grad_norm": 2.25, + "grad_norm_var": 0.029866536458333332, + "learning_rate": 0.0001, + "loss": 7.4456, + "loss/crossentropy": 2.146829605102539, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2174145206809044, + "step": 5142 + }, + { + "epoch": 0.3215, + "grad_norm": 2.28125, + "grad_norm_var": 0.030256144205729165, + "learning_rate": 0.0001, + "loss": 7.2912, + "loss/crossentropy": 2.283796191215515, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22230414301156998, + "step": 5144 + }, + { + "epoch": 0.321625, + "grad_norm": 2.703125, + "grad_norm_var": 0.040022786458333334, + "learning_rate": 0.0001, + "loss": 7.3014, + "loss/crossentropy": 2.309414267539978, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.24046258628368378, + "step": 5146 + }, + { + "epoch": 0.32175, + "grad_norm": 1.9375, + "grad_norm_var": 0.045531209309895834, + "learning_rate": 0.0001, + "loss": 7.1654, + "loss/crossentropy": 2.361426830291748, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21131903678178787, + "step": 5148 + }, + { + "epoch": 0.321875, + "grad_norm": 2.40625, + "grad_norm_var": 0.048628743489583334, + "learning_rate": 0.0001, + "loss": 7.3457, + "loss/crossentropy": 2.2447391748428345, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23755492269992828, + "step": 5150 + }, + { + "epoch": 0.322, + "grad_norm": 2.171875, + "grad_norm_var": 0.041304524739583334, + "learning_rate": 0.0001, + "loss": 7.2372, + "loss/crossentropy": 2.0738277435302734, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20171590149402618, + "step": 5152 + }, + { + "epoch": 0.322125, + "grad_norm": 2.21875, + "grad_norm_var": 0.0625396728515625, + "learning_rate": 0.0001, + "loss": 7.3119, + "loss/crossentropy": 2.452193021774292, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2211400717496872, + "step": 5154 + }, + { + "epoch": 0.32225, + "grad_norm": 2.359375, + "grad_norm_var": 0.05943094889322917, + "learning_rate": 0.0001, + "loss": 7.5275, + "loss/crossentropy": 2.1492894887924194, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22379465401172638, + "step": 5156 + }, + { + "epoch": 0.322375, + "grad_norm": 2.296875, + "grad_norm_var": 0.05969950358072917, + "learning_rate": 0.0001, + "loss": 7.1852, + "loss/crossentropy": 2.1073238849639893, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22517702728509903, + "step": 5158 + }, + { + "epoch": 0.3225, + "grad_norm": 2.375, + "grad_norm_var": 0.06669921875, + "learning_rate": 0.0001, + "loss": 7.4154, + "loss/crossentropy": 2.211340069770813, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21814390271902084, + "step": 5160 + }, + { + "epoch": 0.322625, + "grad_norm": 2.3125, + "grad_norm_var": 0.05657145182291667, + "learning_rate": 0.0001, + "loss": 7.4603, + "loss/crossentropy": 2.1871854066848755, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21511095762252808, + "step": 5162 + }, + { + "epoch": 0.32275, + "grad_norm": 2.46875, + "grad_norm_var": 0.04453837076822917, + "learning_rate": 0.0001, + "loss": 7.3515, + "loss/crossentropy": 2.2441636323928833, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.23230791091918945, + "step": 5164 + }, + { + "epoch": 0.322875, + "grad_norm": 2.15625, + "grad_norm_var": 0.040892537434895834, + "learning_rate": 0.0001, + "loss": 7.1981, + "loss/crossentropy": 2.134332001209259, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21239123493433, + "step": 5166 + }, + { + "epoch": 0.323, + "grad_norm": 3.125, + "grad_norm_var": 0.07382405598958333, + "learning_rate": 0.0001, + "loss": 7.5831, + "loss/crossentropy": 2.3247495889663696, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21503940969705582, + "step": 5168 + }, + { + "epoch": 0.323125, + "grad_norm": 2.484375, + "grad_norm_var": 0.059244791666666664, + "learning_rate": 0.0001, + "loss": 7.5669, + "loss/crossentropy": 2.187830626964569, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.24586067348718643, + "step": 5170 + }, + { + "epoch": 0.32325, + "grad_norm": 2.265625, + "grad_norm_var": 0.06236572265625, + "learning_rate": 0.0001, + "loss": 7.2922, + "loss/crossentropy": 2.291743755340576, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22773872315883636, + "step": 5172 + }, + { + "epoch": 0.323375, + "grad_norm": 2.3125, + "grad_norm_var": 0.06570536295572917, + "learning_rate": 0.0001, + "loss": 7.2504, + "loss/crossentropy": 2.20532488822937, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21305081248283386, + "step": 5174 + }, + { + "epoch": 0.3235, + "grad_norm": 2.25, + "grad_norm_var": 0.06047770182291667, + "learning_rate": 0.0001, + "loss": 7.3579, + "loss/crossentropy": 2.226097345352173, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.223523810505867, + "step": 5176 + }, + { + "epoch": 0.323625, + "grad_norm": 2.234375, + "grad_norm_var": 0.06304931640625, + "learning_rate": 0.0001, + "loss": 7.0957, + "loss/crossentropy": 2.05389004945755, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20273905247449875, + "step": 5178 + }, + { + "epoch": 0.32375, + "grad_norm": 2.109375, + "grad_norm_var": 0.06721089680989584, + "learning_rate": 0.0001, + "loss": 7.2408, + "loss/crossentropy": 2.0276909470558167, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20203246176242828, + "step": 5180 + }, + { + "epoch": 0.323875, + "grad_norm": 2.328125, + "grad_norm_var": 0.06479390462239583, + "learning_rate": 0.0001, + "loss": 7.5044, + "loss/crossentropy": 2.2829922437667847, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21530025452375412, + "step": 5182 + }, + { + "epoch": 0.324, + "grad_norm": 2.125, + "grad_norm_var": 0.023453776041666666, + "learning_rate": 0.0001, + "loss": 7.3556, + "loss/crossentropy": 2.1827311515808105, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.218561053276062, + "step": 5184 + }, + { + "epoch": 0.324125, + "grad_norm": 2.234375, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 7.3249, + "loss/crossentropy": 2.091078519821167, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23558902740478516, + "step": 5186 + }, + { + "epoch": 0.32425, + "grad_norm": 2.84375, + "grad_norm_var": 0.0319244384765625, + "learning_rate": 0.0001, + "loss": 7.3744, + "loss/crossentropy": 1.8757956624031067, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19631417095661163, + "step": 5188 + }, + { + "epoch": 0.324375, + "grad_norm": 2.328125, + "grad_norm_var": 0.031037394205729166, + "learning_rate": 0.0001, + "loss": 7.284, + "loss/crossentropy": 2.342907428741455, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22963730245828629, + "step": 5190 + }, + { + "epoch": 0.3245, + "grad_norm": 2.28125, + "grad_norm_var": 0.027179972330729166, + "learning_rate": 0.0001, + "loss": 7.3116, + "loss/crossentropy": 2.244239926338196, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2171139419078827, + "step": 5192 + }, + { + "epoch": 0.324625, + "grad_norm": 2.3125, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 7.26, + "loss/crossentropy": 2.035650849342346, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2151600569486618, + "step": 5194 + }, + { + "epoch": 0.32475, + "grad_norm": 2.265625, + "grad_norm_var": 0.023274739583333332, + "learning_rate": 0.0001, + "loss": 7.3792, + "loss/crossentropy": 2.309348702430725, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21702983975410461, + "step": 5196 + }, + { + "epoch": 0.324875, + "grad_norm": 2.140625, + "grad_norm_var": 0.026732381184895834, + "learning_rate": 0.0001, + "loss": 7.1716, + "loss/crossentropy": 2.129282593727112, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21354100108146667, + "step": 5198 + }, + { + "epoch": 0.325, + "grad_norm": 2.28125, + "grad_norm_var": 0.031981404622395834, + "learning_rate": 0.0001, + "loss": 7.2898, + "loss/crossentropy": 2.0915945172309875, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19767944514751434, + "step": 5200 + }, + { + "epoch": 0.325125, + "grad_norm": 2.515625, + "grad_norm_var": 0.0355133056640625, + "learning_rate": 0.0001, + "loss": 7.249, + "loss/crossentropy": 2.389148235321045, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23801419138908386, + "step": 5202 + }, + { + "epoch": 0.32525, + "grad_norm": 2.203125, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 7.4405, + "loss/crossentropy": 2.0830936431884766, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19635440409183502, + "step": 5204 + }, + { + "epoch": 0.325375, + "grad_norm": 2.390625, + "grad_norm_var": 0.016304524739583333, + "learning_rate": 0.0001, + "loss": 7.2434, + "loss/crossentropy": 2.287827789783478, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22433650493621826, + "step": 5206 + }, + { + "epoch": 0.3255, + "grad_norm": 2.1875, + "grad_norm_var": 0.015111287434895834, + "learning_rate": 0.0001, + "loss": 7.4008, + "loss/crossentropy": 2.550079107284546, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.22946575284004211, + "step": 5208 + }, + { + "epoch": 0.325625, + "grad_norm": 2.234375, + "grad_norm_var": 0.0140777587890625, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.1926426887512207, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2117481455206871, + "step": 5210 + }, + { + "epoch": 0.32575, + "grad_norm": 2.546875, + "grad_norm_var": 0.020099894205729166, + "learning_rate": 0.0001, + "loss": 7.2581, + "loss/crossentropy": 2.2146820425987244, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2380428984761238, + "step": 5212 + }, + { + "epoch": 0.325875, + "grad_norm": 2.046875, + "grad_norm_var": 0.021708170572916668, + "learning_rate": 0.0001, + "loss": 7.2562, + "loss/crossentropy": 2.3124881982803345, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21675492078065872, + "step": 5214 + }, + { + "epoch": 0.326, + "grad_norm": 2.21875, + "grad_norm_var": 0.016161092122395835, + "learning_rate": 0.0001, + "loss": 7.1713, + "loss/crossentropy": 2.3529210090637207, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.223765030503273, + "step": 5216 + }, + { + "epoch": 0.326125, + "grad_norm": 2.46875, + "grad_norm_var": 0.020930989583333334, + "learning_rate": 0.0001, + "loss": 7.3457, + "loss/crossentropy": 2.2348451614379883, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23559778928756714, + "step": 5218 + }, + { + "epoch": 0.32625, + "grad_norm": 2.015625, + "grad_norm_var": 0.026448567708333332, + "learning_rate": 0.0001, + "loss": 7.1213, + "loss/crossentropy": 2.407866954803467, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23116052150726318, + "step": 5220 + }, + { + "epoch": 0.326375, + "grad_norm": 2.390625, + "grad_norm_var": 0.025267537434895834, + "learning_rate": 0.0001, + "loss": 7.43, + "loss/crossentropy": 2.282076358795166, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22940561920404434, + "step": 5222 + }, + { + "epoch": 0.3265, + "grad_norm": 2.109375, + "grad_norm_var": 0.028270467122395834, + "learning_rate": 0.0001, + "loss": 7.2729, + "loss/crossentropy": 2.0684497356414795, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20412250608205795, + "step": 5224 + }, + { + "epoch": 0.326625, + "grad_norm": 2.15625, + "grad_norm_var": 0.029150390625, + "learning_rate": 0.0001, + "loss": 7.4137, + "loss/crossentropy": 2.4470431804656982, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2289155125617981, + "step": 5226 + }, + { + "epoch": 0.32675, + "grad_norm": 2.21875, + "grad_norm_var": 0.025755818684895834, + "learning_rate": 0.0001, + "loss": 7.1882, + "loss/crossentropy": 2.232643723487854, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22333452105522156, + "step": 5228 + }, + { + "epoch": 0.326875, + "grad_norm": 2.21875, + "grad_norm_var": 0.022977701822916665, + "learning_rate": 0.0001, + "loss": 7.1683, + "loss/crossentropy": 1.9237089157104492, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20895886421203613, + "step": 5230 + }, + { + "epoch": 0.327, + "grad_norm": 2.34375, + "grad_norm_var": 0.023460896809895833, + "learning_rate": 0.0001, + "loss": 7.392, + "loss/crossentropy": 2.570519208908081, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2476401850581169, + "step": 5232 + }, + { + "epoch": 0.327125, + "grad_norm": 2.21875, + "grad_norm_var": 0.011909993489583333, + "learning_rate": 0.0001, + "loss": 7.1545, + "loss/crossentropy": 2.332336902618408, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22975466400384903, + "step": 5234 + }, + { + "epoch": 0.32725, + "grad_norm": 3.859375, + "grad_norm_var": 0.1721588134765625, + "learning_rate": 0.0001, + "loss": 7.4894, + "loss/crossentropy": 2.4853312969207764, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22591929137706757, + "step": 5236 + }, + { + "epoch": 0.327375, + "grad_norm": 2.140625, + "grad_norm_var": 0.17515869140625, + "learning_rate": 0.0001, + "loss": 7.3073, + "loss/crossentropy": 2.4176896810531616, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2283809781074524, + "step": 5238 + }, + { + "epoch": 0.3275, + "grad_norm": 2.25, + "grad_norm_var": 0.1721588134765625, + "learning_rate": 0.0001, + "loss": 7.441, + "loss/crossentropy": 2.581329822540283, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.24542076140642166, + "step": 5240 + }, + { + "epoch": 0.327625, + "grad_norm": 2.4375, + "grad_norm_var": 0.16851806640625, + "learning_rate": 0.0001, + "loss": 7.2261, + "loss/crossentropy": 2.2448220252990723, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22246970981359482, + "step": 5242 + }, + { + "epoch": 0.32775, + "grad_norm": 2.15625, + "grad_norm_var": 0.16536051432291668, + "learning_rate": 0.0001, + "loss": 7.3095, + "loss/crossentropy": 2.354673981666565, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2191583588719368, + "step": 5244 + }, + { + "epoch": 0.327875, + "grad_norm": 2.171875, + "grad_norm_var": 0.16868082682291666, + "learning_rate": 0.0001, + "loss": 7.3025, + "loss/crossentropy": 2.3827792406082153, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23692472279071808, + "step": 5246 + }, + { + "epoch": 0.328, + "grad_norm": 2.234375, + "grad_norm_var": 0.17063395182291666, + "learning_rate": 0.0001, + "loss": 7.3818, + "loss/crossentropy": 2.433183193206787, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22385024279356003, + "step": 5248 + }, + { + "epoch": 0.328125, + "grad_norm": 2.28125, + "grad_norm_var": 0.16874898274739583, + "learning_rate": 0.0001, + "loss": 7.3701, + "loss/crossentropy": 2.1747193932533264, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2178487330675125, + "step": 5250 + }, + { + "epoch": 0.32825, + "grad_norm": 2.4375, + "grad_norm_var": 0.01500244140625, + "learning_rate": 0.0001, + "loss": 7.444, + "loss/crossentropy": 2.0669074058532715, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22157609462738037, + "step": 5252 + }, + { + "epoch": 0.328375, + "grad_norm": 2.25, + "grad_norm_var": 0.014655558268229167, + "learning_rate": 0.0001, + "loss": 7.6055, + "loss/crossentropy": 2.4753568172454834, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2559722363948822, + "step": 5254 + }, + { + "epoch": 0.3285, + "grad_norm": 2.28125, + "grad_norm_var": 0.014208984375, + "learning_rate": 0.0001, + "loss": 7.3287, + "loss/crossentropy": 2.3594852685928345, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2157405987381935, + "step": 5256 + }, + { + "epoch": 0.328625, + "grad_norm": 2.3125, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 7.4131, + "loss/crossentropy": 2.2439894676208496, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21909688413143158, + "step": 5258 + }, + { + "epoch": 0.32875, + "grad_norm": 2.375, + "grad_norm_var": 0.010091145833333334, + "learning_rate": 0.0001, + "loss": 7.1475, + "loss/crossentropy": 2.136144995689392, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22025711089372635, + "step": 5260 + }, + { + "epoch": 0.328875, + "grad_norm": 2.21875, + "grad_norm_var": 0.009544881184895833, + "learning_rate": 0.0001, + "loss": 7.4255, + "loss/crossentropy": 2.1109477281570435, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2135395109653473, + "step": 5262 + }, + { + "epoch": 0.329, + "grad_norm": 2.5, + "grad_norm_var": 0.011844889322916666, + "learning_rate": 0.0001, + "loss": 7.5097, + "loss/crossentropy": 2.205671548843384, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21154307574033737, + "step": 5264 + }, + { + "epoch": 0.329125, + "grad_norm": 2.0, + "grad_norm_var": 0.0176910400390625, + "learning_rate": 0.0001, + "loss": 7.1289, + "loss/crossentropy": 2.044199228286743, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.19973917305469513, + "step": 5266 + }, + { + "epoch": 0.32925, + "grad_norm": 2.578125, + "grad_norm_var": 0.021708170572916668, + "learning_rate": 0.0001, + "loss": 7.4721, + "loss/crossentropy": 2.234253406524658, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2099093198776245, + "step": 5268 + }, + { + "epoch": 0.329375, + "grad_norm": 2.125, + "grad_norm_var": 0.0228912353515625, + "learning_rate": 0.0001, + "loss": 7.2664, + "loss/crossentropy": 2.252587676048279, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21195102483034134, + "step": 5270 + }, + { + "epoch": 0.3295, + "grad_norm": 2.59375, + "grad_norm_var": 0.0292877197265625, + "learning_rate": 0.0001, + "loss": 7.3687, + "loss/crossentropy": 2.134685754776001, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2187366485595703, + "step": 5272 + }, + { + "epoch": 0.329625, + "grad_norm": 2.0625, + "grad_norm_var": 0.03511962890625, + "learning_rate": 0.0001, + "loss": 7.3743, + "loss/crossentropy": 2.278490424156189, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.23104965686798096, + "step": 5274 + }, + { + "epoch": 0.32975, + "grad_norm": 2.359375, + "grad_norm_var": 0.03319905598958333, + "learning_rate": 0.0001, + "loss": 7.2294, + "loss/crossentropy": 2.3663605451583862, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2136199250817299, + "step": 5276 + }, + { + "epoch": 0.329875, + "grad_norm": 2.15625, + "grad_norm_var": 0.03323160807291667, + "learning_rate": 0.0001, + "loss": 7.399, + "loss/crossentropy": 2.3179785013198853, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20722012221813202, + "step": 5278 + }, + { + "epoch": 0.33, + "grad_norm": 2.3125, + "grad_norm_var": 0.029841105143229168, + "learning_rate": 0.0001, + "loss": 7.4934, + "loss/crossentropy": 2.3918418884277344, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2301948517560959, + "step": 5280 + }, + { + "epoch": 0.330125, + "grad_norm": 2.390625, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 7.2977, + "loss/crossentropy": 2.0368301272392273, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2010180726647377, + "step": 5282 + }, + { + "epoch": 0.33025, + "grad_norm": 2.328125, + "grad_norm_var": 0.020308430989583334, + "learning_rate": 0.0001, + "loss": 7.2924, + "loss/crossentropy": 2.419578790664673, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22721153497695923, + "step": 5284 + }, + { + "epoch": 0.330375, + "grad_norm": 2.34375, + "grad_norm_var": 0.016722615559895834, + "learning_rate": 0.0001, + "loss": 7.2706, + "loss/crossentropy": 2.213322699069977, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2133590281009674, + "step": 5286 + }, + { + "epoch": 0.3305, + "grad_norm": 2.3125, + "grad_norm_var": 0.009862263997395834, + "learning_rate": 0.0001, + "loss": 7.3482, + "loss/crossentropy": 2.2472715377807617, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23156778514385223, + "step": 5288 + }, + { + "epoch": 0.330625, + "grad_norm": 2.203125, + "grad_norm_var": 0.007417805989583333, + "learning_rate": 0.0001, + "loss": 7.1872, + "loss/crossentropy": 2.298330068588257, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20751896500587463, + "step": 5290 + }, + { + "epoch": 0.33075, + "grad_norm": 2.125, + "grad_norm_var": 0.008055623372395833, + "learning_rate": 0.0001, + "loss": 7.3076, + "loss/crossentropy": 2.2136794328689575, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21609289944171906, + "step": 5292 + }, + { + "epoch": 0.330875, + "grad_norm": 2.1875, + "grad_norm_var": 0.007966105143229167, + "learning_rate": 0.0001, + "loss": 7.3623, + "loss/crossentropy": 2.2835363149642944, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22649526596069336, + "step": 5294 + }, + { + "epoch": 0.331, + "grad_norm": 2.21875, + "grad_norm_var": 0.007743326822916666, + "learning_rate": 0.0001, + "loss": 7.3962, + "loss/crossentropy": 2.3068933486938477, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21767152845859528, + "step": 5296 + }, + { + "epoch": 0.331125, + "grad_norm": 2.328125, + "grad_norm_var": 0.01090087890625, + "learning_rate": 0.0001, + "loss": 7.1498, + "loss/crossentropy": 2.1658458709716797, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.20870249718427658, + "step": 5298 + }, + { + "epoch": 0.33125, + "grad_norm": 2.125, + "grad_norm_var": 0.011454264322916666, + "learning_rate": 0.0001, + "loss": 7.2139, + "loss/crossentropy": 2.022110342979431, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19558995962142944, + "step": 5300 + }, + { + "epoch": 0.331375, + "grad_norm": 2.453125, + "grad_norm_var": 0.014387003580729167, + "learning_rate": 0.0001, + "loss": 7.3322, + "loss/crossentropy": 2.1405563354492188, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20309502631425858, + "step": 5302 + }, + { + "epoch": 0.3315, + "grad_norm": 2.1875, + "grad_norm_var": 0.015641276041666666, + "learning_rate": 0.0001, + "loss": 7.3417, + "loss/crossentropy": 2.1956971883773804, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20331468433141708, + "step": 5304 + }, + { + "epoch": 0.331625, + "grad_norm": 2.234375, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 7.3134, + "loss/crossentropy": 2.3754764795303345, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.23374854773283005, + "step": 5306 + }, + { + "epoch": 0.33175, + "grad_norm": 2.359375, + "grad_norm_var": 0.016695149739583335, + "learning_rate": 0.0001, + "loss": 7.2282, + "loss/crossentropy": 2.068848133087158, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.19378670305013657, + "step": 5308 + }, + { + "epoch": 0.331875, + "grad_norm": 2.046875, + "grad_norm_var": 0.019205729166666668, + "learning_rate": 0.0001, + "loss": 7.1813, + "loss/crossentropy": 2.3996471166610718, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2248714193701744, + "step": 5310 + }, + { + "epoch": 0.332, + "grad_norm": 2.140625, + "grad_norm_var": 0.020052083333333335, + "learning_rate": 0.0001, + "loss": 7.3708, + "loss/crossentropy": 2.327105164527893, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2335279881954193, + "step": 5312 + }, + { + "epoch": 0.332125, + "grad_norm": 2.203125, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 7.3023, + "loss/crossentropy": 2.254124402999878, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21904382854700089, + "step": 5314 + }, + { + "epoch": 0.33225, + "grad_norm": 2.125, + "grad_norm_var": 0.0150054931640625, + "learning_rate": 0.0001, + "loss": 7.2638, + "loss/crossentropy": 2.242727756500244, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23604810237884521, + "step": 5316 + }, + { + "epoch": 0.332375, + "grad_norm": 2.21875, + "grad_norm_var": 0.0114898681640625, + "learning_rate": 0.0001, + "loss": 7.3476, + "loss/crossentropy": 2.1725869178771973, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22862936556339264, + "step": 5318 + }, + { + "epoch": 0.3325, + "grad_norm": 2.4375, + "grad_norm_var": 0.0150390625, + "learning_rate": 0.0001, + "loss": 7.4291, + "loss/crossentropy": 2.289436936378479, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2235860824584961, + "step": 5320 + }, + { + "epoch": 0.332625, + "grad_norm": 2.03125, + "grad_norm_var": 0.01962890625, + "learning_rate": 0.0001, + "loss": 7.1351, + "loss/crossentropy": 2.306610345840454, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2362884059548378, + "step": 5322 + }, + { + "epoch": 0.33275, + "grad_norm": 2.15625, + "grad_norm_var": 0.015583292643229166, + "learning_rate": 0.0001, + "loss": 7.3527, + "loss/crossentropy": 2.2097585201263428, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.24638129770755768, + "step": 5324 + }, + { + "epoch": 0.332875, + "grad_norm": 2.296875, + "grad_norm_var": 0.015462239583333334, + "learning_rate": 0.0001, + "loss": 7.3412, + "loss/crossentropy": 2.1767340898513794, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2145010083913803, + "step": 5326 + }, + { + "epoch": 0.333, + "grad_norm": 2.125, + "grad_norm_var": 0.0224029541015625, + "learning_rate": 0.0001, + "loss": 7.2168, + "loss/crossentropy": 2.1483041048049927, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21507804840803146, + "step": 5328 + }, + { + "epoch": 0.333125, + "grad_norm": 2.421875, + "grad_norm_var": 0.024930826822916665, + "learning_rate": 0.0001, + "loss": 7.4416, + "loss/crossentropy": 2.1952147483825684, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.21811091899871826, + "step": 5330 + }, + { + "epoch": 0.33325, + "grad_norm": 2.40625, + "grad_norm_var": 0.0267730712890625, + "learning_rate": 0.0001, + "loss": 7.4886, + "loss/crossentropy": 2.136277675628662, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2098674327135086, + "step": 5332 + }, + { + "epoch": 0.333375, + "grad_norm": 2.109375, + "grad_norm_var": 0.027962239583333333, + "learning_rate": 0.0001, + "loss": 7.187, + "loss/crossentropy": 2.1514610052108765, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2044997215270996, + "step": 5334 + }, + { + "epoch": 0.3335, + "grad_norm": 2.46875, + "grad_norm_var": 0.026610310872395834, + "learning_rate": 0.0001, + "loss": 7.2754, + "loss/crossentropy": 2.2177401781082153, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.202798530459404, + "step": 5336 + }, + { + "epoch": 0.333625, + "grad_norm": 2.09375, + "grad_norm_var": 0.021239217122395834, + "learning_rate": 0.0001, + "loss": 7.1166, + "loss/crossentropy": 2.191626250743866, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21423222124576569, + "step": 5338 + }, + { + "epoch": 0.33375, + "grad_norm": 2.484375, + "grad_norm_var": 0.0244293212890625, + "learning_rate": 0.0001, + "loss": 7.28, + "loss/crossentropy": 2.3615646362304688, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22018377482891083, + "step": 5340 + }, + { + "epoch": 0.333875, + "grad_norm": 2.4375, + "grad_norm_var": 0.029899088541666667, + "learning_rate": 0.0001, + "loss": 7.401, + "loss/crossentropy": 2.231188416481018, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22484582662582397, + "step": 5342 + }, + { + "epoch": 0.334, + "grad_norm": 2.140625, + "grad_norm_var": 0.026090494791666665, + "learning_rate": 0.0001, + "loss": 7.3716, + "loss/crossentropy": 2.2002989053726196, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21874640136957169, + "step": 5344 + }, + { + "epoch": 0.334125, + "grad_norm": 2.484375, + "grad_norm_var": 0.0256744384765625, + "learning_rate": 0.0001, + "loss": 7.4587, + "loss/crossentropy": 2.4450970888137817, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21748078614473343, + "step": 5346 + }, + { + "epoch": 0.33425, + "grad_norm": 3.015625, + "grad_norm_var": 0.057673136393229164, + "learning_rate": 0.0001, + "loss": 7.2319, + "loss/crossentropy": 2.109615385532379, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.19727355241775513, + "step": 5348 + }, + { + "epoch": 0.334375, + "grad_norm": 2.5625, + "grad_norm_var": 0.052783203125, + "learning_rate": 0.0001, + "loss": 7.6111, + "loss/crossentropy": 2.1436294317245483, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22486217319965363, + "step": 5350 + }, + { + "epoch": 0.3345, + "grad_norm": 2.296875, + "grad_norm_var": 0.05257059733072917, + "learning_rate": 0.0001, + "loss": 7.2229, + "loss/crossentropy": 2.407299041748047, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23616841435432434, + "step": 5352 + }, + { + "epoch": 0.334625, + "grad_norm": 2.328125, + "grad_norm_var": 0.04553629557291667, + "learning_rate": 0.0001, + "loss": 7.2801, + "loss/crossentropy": 2.192195415496826, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22064944356679916, + "step": 5354 + }, + { + "epoch": 0.33475, + "grad_norm": 2.3125, + "grad_norm_var": 0.04217122395833333, + "learning_rate": 0.0001, + "loss": 7.4254, + "loss/crossentropy": 2.1264270544052124, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2075883150100708, + "step": 5356 + }, + { + "epoch": 0.334875, + "grad_norm": 2.171875, + "grad_norm_var": 0.0422760009765625, + "learning_rate": 0.0001, + "loss": 7.2029, + "loss/crossentropy": 2.1121798753738403, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21974780410528183, + "step": 5358 + }, + { + "epoch": 0.335, + "grad_norm": 2.3125, + "grad_norm_var": 0.0387847900390625, + "learning_rate": 0.0001, + "loss": 7.4508, + "loss/crossentropy": 2.2427613735198975, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21257954835891724, + "step": 5360 + }, + { + "epoch": 0.335125, + "grad_norm": 2.25, + "grad_norm_var": 0.04039713541666667, + "learning_rate": 0.0001, + "loss": 7.4028, + "loss/crossentropy": 2.186354875564575, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.20210053771734238, + "step": 5362 + }, + { + "epoch": 0.33525, + "grad_norm": 2.1875, + "grad_norm_var": 0.012723795572916667, + "learning_rate": 0.0001, + "loss": 7.2075, + "loss/crossentropy": 2.155713438987732, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.203889898955822, + "step": 5364 + }, + { + "epoch": 0.335375, + "grad_norm": 2.265625, + "grad_norm_var": 0.008805338541666667, + "learning_rate": 0.0001, + "loss": 7.4549, + "loss/crossentropy": 2.2281126976013184, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22180020064115524, + "step": 5366 + }, + { + "epoch": 0.3355, + "grad_norm": 2.28125, + "grad_norm_var": 0.00972900390625, + "learning_rate": 0.0001, + "loss": 7.4261, + "loss/crossentropy": 2.3512368202209473, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22407396882772446, + "step": 5368 + }, + { + "epoch": 0.335625, + "grad_norm": 2.1875, + "grad_norm_var": 0.01324462890625, + "learning_rate": 0.0001, + "loss": 7.2495, + "loss/crossentropy": 2.2067723274230957, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2186514213681221, + "step": 5370 + }, + { + "epoch": 0.33575, + "grad_norm": 2.09375, + "grad_norm_var": 0.01881103515625, + "learning_rate": 0.0001, + "loss": 7.1389, + "loss/crossentropy": 2.170414924621582, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20364930480718613, + "step": 5372 + }, + { + "epoch": 0.335875, + "grad_norm": 2.1875, + "grad_norm_var": 0.0207183837890625, + "learning_rate": 0.0001, + "loss": 7.1763, + "loss/crossentropy": 2.1207195520401, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.19605641067028046, + "step": 5374 + }, + { + "epoch": 0.336, + "grad_norm": 2.703125, + "grad_norm_var": 0.03235270182291667, + "learning_rate": 0.0001, + "loss": 7.358, + "loss/crossentropy": 2.5215905904769897, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.23153279721736908, + "step": 5376 + }, + { + "epoch": 0.336125, + "grad_norm": 2.21875, + "grad_norm_var": 0.0276519775390625, + "learning_rate": 0.0001, + "loss": 7.3453, + "loss/crossentropy": 2.305363655090332, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21856709569692612, + "step": 5378 + }, + { + "epoch": 0.33625, + "grad_norm": 2.25, + "grad_norm_var": 0.0273101806640625, + "learning_rate": 0.0001, + "loss": 7.3475, + "loss/crossentropy": 1.9425964951515198, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20577477663755417, + "step": 5380 + }, + { + "epoch": 0.336375, + "grad_norm": 2.171875, + "grad_norm_var": 0.028123982747395835, + "learning_rate": 0.0001, + "loss": 7.4189, + "loss/crossentropy": 2.2299511432647705, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.26048048585653305, + "step": 5382 + }, + { + "epoch": 0.3365, + "grad_norm": 2.28125, + "grad_norm_var": 0.027831013997395834, + "learning_rate": 0.0001, + "loss": 7.424, + "loss/crossentropy": 2.1312129497528076, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21301250159740448, + "step": 5384 + }, + { + "epoch": 0.336625, + "grad_norm": 2.21875, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 7.2993, + "loss/crossentropy": 2.1534899473190308, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23556435108184814, + "step": 5386 + }, + { + "epoch": 0.33675, + "grad_norm": 2.296875, + "grad_norm_var": 0.019270833333333334, + "learning_rate": 0.0001, + "loss": 7.3045, + "loss/crossentropy": 2.2635024189949036, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2051224410533905, + "step": 5388 + }, + { + "epoch": 0.336875, + "grad_norm": 2.15625, + "grad_norm_var": 0.017317708333333334, + "learning_rate": 0.0001, + "loss": 7.3403, + "loss/crossentropy": 2.6195231676101685, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.24129101634025574, + "step": 5390 + }, + { + "epoch": 0.337, + "grad_norm": 2.578125, + "grad_norm_var": 0.010285441080729167, + "learning_rate": 0.0001, + "loss": 7.2644, + "loss/crossentropy": 2.0847758054733276, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.20890209078788757, + "step": 5392 + }, + { + "epoch": 0.337125, + "grad_norm": 2.15625, + "grad_norm_var": 0.017308553059895832, + "learning_rate": 0.0001, + "loss": 7.4097, + "loss/crossentropy": 2.41829776763916, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.23831215500831604, + "step": 5394 + }, + { + "epoch": 0.33725, + "grad_norm": 2.40625, + "grad_norm_var": 0.018285115559895832, + "learning_rate": 0.0001, + "loss": 7.3525, + "loss/crossentropy": 2.169269323348999, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.21398558467626572, + "step": 5396 + }, + { + "epoch": 0.337375, + "grad_norm": 2.15625, + "grad_norm_var": 0.018424479166666667, + "learning_rate": 0.0001, + "loss": 7.3212, + "loss/crossentropy": 2.198991537094116, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21486544609069824, + "step": 5398 + }, + { + "epoch": 0.3375, + "grad_norm": 2.15625, + "grad_norm_var": 0.019562784830729166, + "learning_rate": 0.0001, + "loss": 7.1218, + "loss/crossentropy": 2.184316039085388, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20338526368141174, + "step": 5400 + }, + { + "epoch": 0.337625, + "grad_norm": 2.328125, + "grad_norm_var": 0.018290201822916668, + "learning_rate": 0.0001, + "loss": 7.321, + "loss/crossentropy": 2.111461341381073, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21006064862012863, + "step": 5402 + }, + { + "epoch": 0.33775, + "grad_norm": 2.203125, + "grad_norm_var": 0.019115193684895834, + "learning_rate": 0.0001, + "loss": 7.3327, + "loss/crossentropy": 2.301071524620056, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21048437803983688, + "step": 5404 + }, + { + "epoch": 0.337875, + "grad_norm": 2.21875, + "grad_norm_var": 0.0183746337890625, + "learning_rate": 0.0001, + "loss": 7.2345, + "loss/crossentropy": 2.2816654443740845, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22754280269145966, + "step": 5406 + }, + { + "epoch": 0.338, + "grad_norm": 2.1875, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 7.2538, + "loss/crossentropy": 2.3284599781036377, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23177867382764816, + "step": 5408 + }, + { + "epoch": 0.338125, + "grad_norm": 2.21875, + "grad_norm_var": 0.006266276041666667, + "learning_rate": 0.0001, + "loss": 7.1344, + "loss/crossentropy": 2.009117007255554, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.195498988032341, + "step": 5410 + }, + { + "epoch": 0.33825, + "grad_norm": 2.234375, + "grad_norm_var": 0.005143229166666667, + "learning_rate": 0.0001, + "loss": 7.3075, + "loss/crossentropy": 2.0180088877677917, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.22797267884016037, + "step": 5412 + }, + { + "epoch": 0.338375, + "grad_norm": 2.265625, + "grad_norm_var": 0.004736328125, + "learning_rate": 0.0001, + "loss": 7.3005, + "loss/crossentropy": 2.1382123231887817, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2192755788564682, + "step": 5414 + }, + { + "epoch": 0.3385, + "grad_norm": 2.203125, + "grad_norm_var": 0.004325358072916666, + "learning_rate": 0.0001, + "loss": 7.3033, + "loss/crossentropy": 2.408115029335022, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2371460422873497, + "step": 5416 + }, + { + "epoch": 0.338625, + "grad_norm": 2.859375, + "grad_norm_var": 0.16881103515625, + "learning_rate": 0.0001, + "loss": 7.3914, + "loss/crossentropy": 2.1764304637908936, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20545508712530136, + "step": 5418 + }, + { + "epoch": 0.33875, + "grad_norm": 2.359375, + "grad_norm_var": 0.16660054524739584, + "learning_rate": 0.0001, + "loss": 7.305, + "loss/crossentropy": 2.1204302310943604, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2243422120809555, + "step": 5420 + }, + { + "epoch": 0.338875, + "grad_norm": 2.484375, + "grad_norm_var": 0.16787821451822918, + "learning_rate": 0.0001, + "loss": 7.395, + "loss/crossentropy": 2.1409205198287964, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20757804811000824, + "step": 5422 + }, + { + "epoch": 0.339, + "grad_norm": 2.109375, + "grad_norm_var": 0.17183837890625, + "learning_rate": 0.0001, + "loss": 7.239, + "loss/crossentropy": 2.3405885696411133, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2265135422348976, + "step": 5424 + }, + { + "epoch": 0.339125, + "grad_norm": 2.265625, + "grad_norm_var": 0.17040608723958334, + "learning_rate": 0.0001, + "loss": 7.305, + "loss/crossentropy": 2.0772798657417297, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.24080167710781097, + "step": 5426 + }, + { + "epoch": 0.33925, + "grad_norm": 2.265625, + "grad_norm_var": 0.16744791666666667, + "learning_rate": 0.0001, + "loss": 7.3626, + "loss/crossentropy": 2.3165992498397827, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21086010336875916, + "step": 5428 + }, + { + "epoch": 0.339375, + "grad_norm": 2.25, + "grad_norm_var": 0.16593424479166666, + "learning_rate": 0.0001, + "loss": 7.2824, + "loss/crossentropy": 2.2697300910949707, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.216596320271492, + "step": 5430 + }, + { + "epoch": 0.3395, + "grad_norm": 2.09375, + "grad_norm_var": 0.1771636962890625, + "learning_rate": 0.0001, + "loss": 7.1865, + "loss/crossentropy": 2.3810267448425293, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2208058461546898, + "step": 5432 + }, + { + "epoch": 0.339625, + "grad_norm": 2.125, + "grad_norm_var": 0.027860514322916665, + "learning_rate": 0.0001, + "loss": 7.2512, + "loss/crossentropy": 2.255163311958313, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.205363892018795, + "step": 5434 + }, + { + "epoch": 0.33975, + "grad_norm": 2.484375, + "grad_norm_var": 0.0278717041015625, + "learning_rate": 0.0001, + "loss": 7.2184, + "loss/crossentropy": 2.0902993083000183, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2042091339826584, + "step": 5436 + }, + { + "epoch": 0.339875, + "grad_norm": 2.078125, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 7.3953, + "loss/crossentropy": 2.379234194755554, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22250881046056747, + "step": 5438 + }, + { + "epoch": 0.34, + "grad_norm": 2.171875, + "grad_norm_var": 0.015120442708333333, + "learning_rate": 0.0001, + "loss": 7.1556, + "loss/crossentropy": 2.1854381561279297, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20145095884799957, + "step": 5440 + }, + { + "epoch": 0.340125, + "grad_norm": 2.25, + "grad_norm_var": 0.015453084309895834, + "learning_rate": 0.0001, + "loss": 7.3506, + "loss/crossentropy": 2.4397945404052734, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21113866567611694, + "step": 5442 + }, + { + "epoch": 0.34025, + "grad_norm": 2.0, + "grad_norm_var": 0.019580078125, + "learning_rate": 0.0001, + "loss": 7.2824, + "loss/crossentropy": 2.2910887002944946, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22225283086299896, + "step": 5444 + }, + { + "epoch": 0.340375, + "grad_norm": 2.1875, + "grad_norm_var": 0.022215779622395834, + "learning_rate": 0.0001, + "loss": 7.4318, + "loss/crossentropy": 2.4982157945632935, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23762008547782898, + "step": 5446 + }, + { + "epoch": 0.3405, + "grad_norm": 2.34375, + "grad_norm_var": 0.03228759765625, + "learning_rate": 0.0001, + "loss": 7.5355, + "loss/crossentropy": 2.3466309309005737, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2286328449845314, + "step": 5448 + }, + { + "epoch": 0.340625, + "grad_norm": 2.109375, + "grad_norm_var": 0.033426920572916664, + "learning_rate": 0.0001, + "loss": 7.4912, + "loss/crossentropy": 2.5358729362487793, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.24695628136396408, + "step": 5450 + }, + { + "epoch": 0.34075, + "grad_norm": 2.671875, + "grad_norm_var": 0.03862202962239583, + "learning_rate": 0.0001, + "loss": 7.5367, + "loss/crossentropy": 2.3047229051589966, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.21712347865104675, + "step": 5452 + }, + { + "epoch": 0.340875, + "grad_norm": 2.09375, + "grad_norm_var": 0.038386027018229164, + "learning_rate": 0.0001, + "loss": 7.119, + "loss/crossentropy": 2.1131972074508667, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21644818782806396, + "step": 5454 + }, + { + "epoch": 0.341, + "grad_norm": 2.484375, + "grad_norm_var": 0.042780558268229164, + "learning_rate": 0.0001, + "loss": 7.4856, + "loss/crossentropy": 2.1927385330200195, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23934345692396164, + "step": 5456 + }, + { + "epoch": 0.341125, + "grad_norm": 2.390625, + "grad_norm_var": 0.04208577473958333, + "learning_rate": 0.0001, + "loss": 7.5131, + "loss/crossentropy": 2.1948903799057007, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2262982353568077, + "step": 5458 + }, + { + "epoch": 0.34125, + "grad_norm": 2.359375, + "grad_norm_var": 0.03450419108072917, + "learning_rate": 0.0001, + "loss": 7.2735, + "loss/crossentropy": 2.2074116468429565, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21651601791381836, + "step": 5460 + }, + { + "epoch": 0.341375, + "grad_norm": 2.65625, + "grad_norm_var": 0.4347320556640625, + "learning_rate": 0.0001, + "loss": 7.6036, + "loss/crossentropy": 2.3377386331558228, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.260085329413414, + "step": 5462 + }, + { + "epoch": 0.3415, + "grad_norm": 2.21875, + "grad_norm_var": 0.43775634765625, + "learning_rate": 0.0001, + "loss": 7.3691, + "loss/crossentropy": 2.5194804668426514, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2196458876132965, + "step": 5464 + }, + { + "epoch": 0.341625, + "grad_norm": 2.140625, + "grad_norm_var": 0.43778889973958335, + "learning_rate": 0.0001, + "loss": 7.3684, + "loss/crossentropy": 2.222210168838501, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20729803293943405, + "step": 5466 + }, + { + "epoch": 0.34175, + "grad_norm": 2.1875, + "grad_norm_var": 0.44607645670572915, + "learning_rate": 0.0001, + "loss": 7.5481, + "loss/crossentropy": 2.0169124603271484, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22089700400829315, + "step": 5468 + }, + { + "epoch": 0.341875, + "grad_norm": 2.34375, + "grad_norm_var": 0.4344146728515625, + "learning_rate": 0.0001, + "loss": 7.3955, + "loss/crossentropy": 2.1813031435012817, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2193674072623253, + "step": 5470 + }, + { + "epoch": 0.342, + "grad_norm": 2.0, + "grad_norm_var": 0.45690816243489585, + "learning_rate": 0.0001, + "loss": 7.2806, + "loss/crossentropy": 2.2984254360198975, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.22460055351257324, + "step": 5472 + }, + { + "epoch": 0.342125, + "grad_norm": 2.3125, + "grad_norm_var": 0.46097005208333336, + "learning_rate": 0.0001, + "loss": 7.3134, + "loss/crossentropy": 2.1755064725875854, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.21521113812923431, + "step": 5474 + }, + { + "epoch": 0.34225, + "grad_norm": 2.1875, + "grad_norm_var": 0.46314697265625, + "learning_rate": 0.0001, + "loss": 7.1628, + "loss/crossentropy": 2.117392897605896, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19606874883174896, + "step": 5476 + }, + { + "epoch": 0.342375, + "grad_norm": 2.3125, + "grad_norm_var": 0.022516886393229168, + "learning_rate": 0.0001, + "loss": 7.2474, + "loss/crossentropy": 2.127562403678894, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21586275100708008, + "step": 5478 + }, + { + "epoch": 0.3425, + "grad_norm": 2.609375, + "grad_norm_var": 0.027958170572916666, + "learning_rate": 0.0001, + "loss": 7.4147, + "loss/crossentropy": 2.201394200325012, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21986886113882065, + "step": 5480 + }, + { + "epoch": 0.342625, + "grad_norm": 2.03125, + "grad_norm_var": 0.03655192057291667, + "learning_rate": 0.0001, + "loss": 7.2027, + "loss/crossentropy": 2.0617064237594604, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22360235452651978, + "step": 5482 + }, + { + "epoch": 0.34275, + "grad_norm": 2.109375, + "grad_norm_var": 0.03845926920572917, + "learning_rate": 0.0001, + "loss": 7.3388, + "loss/crossentropy": 2.266227602958679, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21536527574062347, + "step": 5484 + }, + { + "epoch": 0.342875, + "grad_norm": 2.453125, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 0.0001, + "loss": 7.2759, + "loss/crossentropy": 2.3090654611587524, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2186703011393547, + "step": 5486 + }, + { + "epoch": 0.343, + "grad_norm": 2.234375, + "grad_norm_var": 0.028180948893229165, + "learning_rate": 0.0001, + "loss": 7.4189, + "loss/crossentropy": 2.4446980953216553, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24092496931552887, + "step": 5488 + }, + { + "epoch": 0.343125, + "grad_norm": 2.140625, + "grad_norm_var": 0.0316314697265625, + "learning_rate": 0.0001, + "loss": 7.3887, + "loss/crossentropy": 2.301460862159729, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2261280044913292, + "step": 5490 + }, + { + "epoch": 0.34325, + "grad_norm": 2.40625, + "grad_norm_var": 0.031012980143229167, + "learning_rate": 0.0001, + "loss": 7.2817, + "loss/crossentropy": 2.375279426574707, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22997137159109116, + "step": 5492 + }, + { + "epoch": 0.343375, + "grad_norm": 2.140625, + "grad_norm_var": 0.03375244140625, + "learning_rate": 0.0001, + "loss": 7.3397, + "loss/crossentropy": 2.174167513847351, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22003310173749924, + "step": 5494 + }, + { + "epoch": 0.3435, + "grad_norm": 2.296875, + "grad_norm_var": 0.02275390625, + "learning_rate": 0.0001, + "loss": 7.2999, + "loss/crossentropy": 2.264827609062195, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22024701535701752, + "step": 5496 + }, + { + "epoch": 0.343625, + "grad_norm": 2.078125, + "grad_norm_var": 0.0152740478515625, + "learning_rate": 0.0001, + "loss": 7.2387, + "loss/crossentropy": 2.186145544052124, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21280007809400558, + "step": 5498 + }, + { + "epoch": 0.34375, + "grad_norm": 2.5625, + "grad_norm_var": 0.01982421875, + "learning_rate": 0.0001, + "loss": 7.3772, + "loss/crossentropy": 2.481659770011902, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2214002087712288, + "step": 5500 + }, + { + "epoch": 0.343875, + "grad_norm": 2.125, + "grad_norm_var": 0.019367472330729166, + "learning_rate": 0.0001, + "loss": 7.3132, + "loss/crossentropy": 2.3691645860671997, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.23550476133823395, + "step": 5502 + }, + { + "epoch": 0.344, + "grad_norm": 2.234375, + "grad_norm_var": 0.01949462890625, + "learning_rate": 0.0001, + "loss": 7.2032, + "loss/crossentropy": 2.260165572166443, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23349297046661377, + "step": 5504 + }, + { + "epoch": 0.344125, + "grad_norm": 2.109375, + "grad_norm_var": 0.01617431640625, + "learning_rate": 0.0001, + "loss": 7.3419, + "loss/crossentropy": 2.2282174825668335, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22737015783786774, + "step": 5506 + }, + { + "epoch": 0.34425, + "grad_norm": 2.578125, + "grad_norm_var": 0.021663411458333334, + "learning_rate": 0.0001, + "loss": 7.2331, + "loss/crossentropy": 2.3071209192276, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.23168734461069107, + "step": 5508 + }, + { + "epoch": 0.344375, + "grad_norm": 2.296875, + "grad_norm_var": 0.032013956705729166, + "learning_rate": 0.0001, + "loss": 7.5768, + "loss/crossentropy": 2.337808847427368, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.22684383392333984, + "step": 5510 + }, + { + "epoch": 0.3445, + "grad_norm": 2.125, + "grad_norm_var": 0.03868815104166667, + "learning_rate": 0.0001, + "loss": 7.1271, + "loss/crossentropy": 2.21714323759079, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2065524309873581, + "step": 5512 + }, + { + "epoch": 0.344625, + "grad_norm": 2.46875, + "grad_norm_var": 0.03770243326822917, + "learning_rate": 0.0001, + "loss": 7.5043, + "loss/crossentropy": 2.3815321922302246, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22905410081148148, + "step": 5514 + }, + { + "epoch": 0.34475, + "grad_norm": 2.21875, + "grad_norm_var": 0.03234049479166667, + "learning_rate": 0.0001, + "loss": 7.2816, + "loss/crossentropy": 2.4301551580429077, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2220115140080452, + "step": 5516 + }, + { + "epoch": 0.344875, + "grad_norm": 2.28125, + "grad_norm_var": 0.03194986979166667, + "learning_rate": 0.0001, + "loss": 7.2628, + "loss/crossentropy": 2.404889702796936, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21883364766836166, + "step": 5518 + }, + { + "epoch": 0.345, + "grad_norm": 2.21875, + "grad_norm_var": 0.03205464680989583, + "learning_rate": 0.0001, + "loss": 7.3638, + "loss/crossentropy": 2.228495955467224, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2163659930229187, + "step": 5520 + }, + { + "epoch": 0.345125, + "grad_norm": 2.234375, + "grad_norm_var": 0.029816691080729166, + "learning_rate": 0.0001, + "loss": 7.1626, + "loss/crossentropy": 2.102945566177368, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20724687725305557, + "step": 5522 + }, + { + "epoch": 0.34525, + "grad_norm": 2.140625, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 7.3196, + "loss/crossentropy": 2.291762113571167, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2216021940112114, + "step": 5524 + }, + { + "epoch": 0.345375, + "grad_norm": 2.140625, + "grad_norm_var": 0.014644368489583334, + "learning_rate": 0.0001, + "loss": 7.2901, + "loss/crossentropy": 2.3197373151779175, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217041775584221, + "step": 5526 + }, + { + "epoch": 0.3455, + "grad_norm": 2.171875, + "grad_norm_var": 0.009891764322916666, + "learning_rate": 0.0001, + "loss": 7.1251, + "loss/crossentropy": 2.16168212890625, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2141154482960701, + "step": 5528 + }, + { + "epoch": 0.345625, + "grad_norm": 2.359375, + "grad_norm_var": 0.01103515625, + "learning_rate": 0.0001, + "loss": 7.2774, + "loss/crossentropy": 2.3330233097076416, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2168646603822708, + "step": 5530 + }, + { + "epoch": 0.34575, + "grad_norm": 2.703125, + "grad_norm_var": 0.025951131184895834, + "learning_rate": 0.0001, + "loss": 7.4739, + "loss/crossentropy": 2.0444337129592896, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2103881537914276, + "step": 5532 + }, + { + "epoch": 0.345875, + "grad_norm": 2.359375, + "grad_norm_var": 0.025446573893229168, + "learning_rate": 0.0001, + "loss": 7.4652, + "loss/crossentropy": 2.315095543861389, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2205868437886238, + "step": 5534 + }, + { + "epoch": 0.346, + "grad_norm": 2.3125, + "grad_norm_var": 0.024323527018229166, + "learning_rate": 0.0001, + "loss": 7.2638, + "loss/crossentropy": 2.2340970039367676, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21409018337726593, + "step": 5536 + }, + { + "epoch": 0.346125, + "grad_norm": 2.078125, + "grad_norm_var": 0.027741495768229166, + "learning_rate": 0.0001, + "loss": 7.2674, + "loss/crossentropy": 2.30459725856781, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22205013036727905, + "step": 5538 + }, + { + "epoch": 0.34625, + "grad_norm": 2.203125, + "grad_norm_var": 0.030475870768229166, + "learning_rate": 0.0001, + "loss": 7.1644, + "loss/crossentropy": 2.048761546611786, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19404233247041702, + "step": 5540 + }, + { + "epoch": 0.346375, + "grad_norm": 2.28125, + "grad_norm_var": 0.028515625, + "learning_rate": 0.0001, + "loss": 7.2601, + "loss/crossentropy": 2.1471662521362305, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2048891857266426, + "step": 5542 + }, + { + "epoch": 0.3465, + "grad_norm": 2.328125, + "grad_norm_var": 0.027274576822916667, + "learning_rate": 0.0001, + "loss": 7.0981, + "loss/crossentropy": 2.147502064704895, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21818849444389343, + "step": 5544 + }, + { + "epoch": 0.346625, + "grad_norm": 2.765625, + "grad_norm_var": 0.0386627197265625, + "learning_rate": 0.0001, + "loss": 7.2796, + "loss/crossentropy": 2.213876247406006, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22943131625652313, + "step": 5546 + }, + { + "epoch": 0.34675, + "grad_norm": 2.5, + "grad_norm_var": 0.028055826822916668, + "learning_rate": 0.0001, + "loss": 7.3104, + "loss/crossentropy": 2.327598452568054, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22541771829128265, + "step": 5548 + }, + { + "epoch": 0.346875, + "grad_norm": 2.171875, + "grad_norm_var": 0.031083170572916666, + "learning_rate": 0.0001, + "loss": 7.2545, + "loss/crossentropy": 2.065271317958832, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2102024182677269, + "step": 5550 + }, + { + "epoch": 0.347, + "grad_norm": 2.15625, + "grad_norm_var": 0.031769816080729166, + "learning_rate": 0.0001, + "loss": 7.2822, + "loss/crossentropy": 2.241680383682251, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.24006665498018265, + "step": 5552 + }, + { + "epoch": 0.347125, + "grad_norm": 2.25, + "grad_norm_var": 0.029816691080729166, + "learning_rate": 0.0001, + "loss": 7.2535, + "loss/crossentropy": 1.9361643195152283, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.193950355052948, + "step": 5554 + }, + { + "epoch": 0.34725, + "grad_norm": 2.515625, + "grad_norm_var": 0.029618326822916666, + "learning_rate": 0.0001, + "loss": 7.2865, + "loss/crossentropy": 2.2774378061294556, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2135343924164772, + "step": 5556 + }, + { + "epoch": 0.347375, + "grad_norm": 2.1875, + "grad_norm_var": 0.04593098958333333, + "learning_rate": 0.0001, + "loss": 7.2553, + "loss/crossentropy": 2.092591166496277, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21611423045396805, + "step": 5558 + }, + { + "epoch": 0.3475, + "grad_norm": 2.390625, + "grad_norm_var": 0.04534098307291667, + "learning_rate": 0.0001, + "loss": 7.4553, + "loss/crossentropy": 2.4065760374069214, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21628361195325851, + "step": 5560 + }, + { + "epoch": 0.347625, + "grad_norm": 2.09375, + "grad_norm_var": 0.03795166015625, + "learning_rate": 0.0001, + "loss": 7.3378, + "loss/crossentropy": 2.1736042499542236, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21986880898475647, + "step": 5562 + }, + { + "epoch": 0.34775, + "grad_norm": 2.109375, + "grad_norm_var": 0.03855692545572917, + "learning_rate": 0.0001, + "loss": 7.1797, + "loss/crossentropy": 2.377760887145996, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.20999367535114288, + "step": 5564 + }, + { + "epoch": 0.347875, + "grad_norm": 2.4375, + "grad_norm_var": 0.03753255208333333, + "learning_rate": 0.0001, + "loss": 7.2805, + "loss/crossentropy": 2.4437506198883057, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2216576337814331, + "step": 5566 + }, + { + "epoch": 0.348, + "grad_norm": 2.03125, + "grad_norm_var": 0.04220377604166667, + "learning_rate": 0.0001, + "loss": 7.1347, + "loss/crossentropy": 2.3025336265563965, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21155836433172226, + "step": 5568 + }, + { + "epoch": 0.348125, + "grad_norm": 2.265625, + "grad_norm_var": 0.042902628580729164, + "learning_rate": 0.0001, + "loss": 7.2856, + "loss/crossentropy": 2.147992491722107, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20805560052394867, + "step": 5570 + }, + { + "epoch": 0.34825, + "grad_norm": 2.125, + "grad_norm_var": 0.037495930989583336, + "learning_rate": 0.0001, + "loss": 7.2576, + "loss/crossentropy": 2.0868008732795715, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2108987644314766, + "step": 5572 + }, + { + "epoch": 0.348375, + "grad_norm": 2.21875, + "grad_norm_var": 0.014159138997395833, + "learning_rate": 0.0001, + "loss": 7.4734, + "loss/crossentropy": 2.1314677000045776, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22252144664525986, + "step": 5574 + }, + { + "epoch": 0.3485, + "grad_norm": 2.296875, + "grad_norm_var": 0.013472493489583333, + "learning_rate": 0.0001, + "loss": 7.3078, + "loss/crossentropy": 2.5350881814956665, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22396673262119293, + "step": 5576 + }, + { + "epoch": 0.348625, + "grad_norm": 2.21875, + "grad_norm_var": 0.016917928059895834, + "learning_rate": 0.0001, + "loss": 7.298, + "loss/crossentropy": 2.3182544708251953, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2111162766814232, + "step": 5578 + }, + { + "epoch": 0.34875, + "grad_norm": 2.34375, + "grad_norm_var": 0.015876261393229167, + "learning_rate": 0.0001, + "loss": 7.4597, + "loss/crossentropy": 2.1177011728286743, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2125827968120575, + "step": 5580 + }, + { + "epoch": 0.348875, + "grad_norm": 2.125, + "grad_norm_var": 0.013231404622395833, + "learning_rate": 0.0001, + "loss": 7.2578, + "loss/crossentropy": 2.2595534324645996, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20692522078752518, + "step": 5582 + }, + { + "epoch": 0.349, + "grad_norm": 2.265625, + "grad_norm_var": 0.009651692708333333, + "learning_rate": 0.0001, + "loss": 7.4041, + "loss/crossentropy": 2.2322030067443848, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2103104293346405, + "step": 5584 + }, + { + "epoch": 0.349125, + "grad_norm": 2.25, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 7.3692, + "loss/crossentropy": 2.342887282371521, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2122577279806137, + "step": 5586 + }, + { + "epoch": 0.34925, + "grad_norm": 2.265625, + "grad_norm_var": 0.0108062744140625, + "learning_rate": 0.0001, + "loss": 7.2918, + "loss/crossentropy": 2.1660179495811462, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21258103847503662, + "step": 5588 + }, + { + "epoch": 0.349375, + "grad_norm": 2.015625, + "grad_norm_var": 0.013411458333333333, + "learning_rate": 0.0001, + "loss": 7.2614, + "loss/crossentropy": 2.191131591796875, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.20719382166862488, + "step": 5590 + }, + { + "epoch": 0.3495, + "grad_norm": 2.296875, + "grad_norm_var": 0.013133748372395834, + "learning_rate": 0.0001, + "loss": 7.2375, + "loss/crossentropy": 2.0313998460769653, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2047080472111702, + "step": 5592 + }, + { + "epoch": 0.349625, + "grad_norm": 2.671875, + "grad_norm_var": 0.025048828125, + "learning_rate": 0.0001, + "loss": 7.3307, + "loss/crossentropy": 2.2927592992782593, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2102295085787773, + "step": 5594 + }, + { + "epoch": 0.34975, + "grad_norm": 2.171875, + "grad_norm_var": 0.02720947265625, + "learning_rate": 0.0001, + "loss": 7.2445, + "loss/crossentropy": 2.255831480026245, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22202551364898682, + "step": 5596 + }, + { + "epoch": 0.349875, + "grad_norm": 2.015625, + "grad_norm_var": 0.0329010009765625, + "learning_rate": 0.0001, + "loss": 7.1646, + "loss/crossentropy": 2.3973917961120605, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23418821394443512, + "step": 5598 + }, + { + "epoch": 0.35, + "grad_norm": 2.21875, + "grad_norm_var": 0.032746378580729166, + "learning_rate": 0.0001, + "loss": 7.2829, + "loss/crossentropy": 2.029415249824524, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.209575816988945, + "step": 5600 + }, + { + "epoch": 0.350125, + "grad_norm": 2.390625, + "grad_norm_var": 0.03385009765625, + "learning_rate": 0.0001, + "loss": 7.1971, + "loss/crossentropy": 2.4901596307754517, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23769868910312653, + "step": 5602 + }, + { + "epoch": 0.35025, + "grad_norm": 1.984375, + "grad_norm_var": 0.03967692057291667, + "learning_rate": 0.0001, + "loss": 7.1465, + "loss/crossentropy": 2.428591251373291, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22477445006370544, + "step": 5604 + }, + { + "epoch": 0.350375, + "grad_norm": 2.265625, + "grad_norm_var": 0.035888671875, + "learning_rate": 0.0001, + "loss": 7.1955, + "loss/crossentropy": 2.120119094848633, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2116844207048416, + "step": 5606 + }, + { + "epoch": 0.3505, + "grad_norm": 2.25, + "grad_norm_var": 0.03421223958333333, + "learning_rate": 0.0001, + "loss": 7.3445, + "loss/crossentropy": 2.4258477687835693, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2168346494436264, + "step": 5608 + }, + { + "epoch": 0.350625, + "grad_norm": 2.25, + "grad_norm_var": 0.020243326822916668, + "learning_rate": 0.0001, + "loss": 7.2439, + "loss/crossentropy": 2.255433678627014, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21720143407583237, + "step": 5610 + }, + { + "epoch": 0.35075, + "grad_norm": 2.109375, + "grad_norm_var": 0.019481404622395834, + "learning_rate": 0.0001, + "loss": 7.3014, + "loss/crossentropy": 2.3039766550064087, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20905648171901703, + "step": 5612 + }, + { + "epoch": 0.350875, + "grad_norm": 2.203125, + "grad_norm_var": 0.014872233072916666, + "learning_rate": 0.0001, + "loss": 7.2678, + "loss/crossentropy": 2.2413315773010254, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21247967332601547, + "step": 5614 + }, + { + "epoch": 0.351, + "grad_norm": 2.234375, + "grad_norm_var": 0.0152008056640625, + "learning_rate": 0.0001, + "loss": 7.4491, + "loss/crossentropy": 2.434108018875122, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.23025284707546234, + "step": 5616 + }, + { + "epoch": 0.351125, + "grad_norm": 2.265625, + "grad_norm_var": 0.013667805989583334, + "learning_rate": 0.0001, + "loss": 7.3377, + "loss/crossentropy": 2.3039416074752808, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23464076220989227, + "step": 5618 + }, + { + "epoch": 0.35125, + "grad_norm": 2.28125, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 7.1852, + "loss/crossentropy": 2.198891282081604, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20828387141227722, + "step": 5620 + }, + { + "epoch": 0.351375, + "grad_norm": 2.140625, + "grad_norm_var": 0.00660400390625, + "learning_rate": 0.0001, + "loss": 7.3417, + "loss/crossentropy": 1.9807387590408325, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.18464645743370056, + "step": 5622 + }, + { + "epoch": 0.3515, + "grad_norm": 2.328125, + "grad_norm_var": 0.007331339518229166, + "learning_rate": 0.0001, + "loss": 7.3484, + "loss/crossentropy": 2.1647424697875977, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20414948463439941, + "step": 5624 + }, + { + "epoch": 0.351625, + "grad_norm": 2.140625, + "grad_norm_var": 0.0063385009765625, + "learning_rate": 0.0001, + "loss": 7.3053, + "loss/crossentropy": 2.1328948736190796, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22070185095071793, + "step": 5626 + }, + { + "epoch": 0.35175, + "grad_norm": 2.046875, + "grad_norm_var": 0.0060699462890625, + "learning_rate": 0.0001, + "loss": 7.1396, + "loss/crossentropy": 2.041126549243927, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20997025072574615, + "step": 5628 + }, + { + "epoch": 0.351875, + "grad_norm": 2.171875, + "grad_norm_var": 0.006126912434895834, + "learning_rate": 0.0001, + "loss": 7.1444, + "loss/crossentropy": 2.0724629759788513, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20483851432800293, + "step": 5630 + }, + { + "epoch": 0.352, + "grad_norm": 2.390625, + "grad_norm_var": 0.007958984375, + "learning_rate": 0.0001, + "loss": 7.2539, + "loss/crossentropy": 2.167839527130127, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21613173931837082, + "step": 5632 + }, + { + "epoch": 0.352125, + "grad_norm": 2.078125, + "grad_norm_var": 0.013037109375, + "learning_rate": 0.0001, + "loss": 7.1433, + "loss/crossentropy": 2.105876088142395, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20869092643260956, + "step": 5634 + }, + { + "epoch": 0.35225, + "grad_norm": 2.234375, + "grad_norm_var": 0.012970987955729167, + "learning_rate": 0.0001, + "loss": 7.3603, + "loss/crossentropy": 2.1784520745277405, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20698366314172745, + "step": 5636 + }, + { + "epoch": 0.352375, + "grad_norm": 2.34375, + "grad_norm_var": 0.013947550455729167, + "learning_rate": 0.0001, + "loss": 7.3438, + "loss/crossentropy": 2.215299963951111, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21201211214065552, + "step": 5638 + }, + { + "epoch": 0.3525, + "grad_norm": 2.125, + "grad_norm_var": 0.013939412434895833, + "learning_rate": 0.0001, + "loss": 7.2268, + "loss/crossentropy": 2.0955730676651, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20698635280132294, + "step": 5640 + }, + { + "epoch": 0.352625, + "grad_norm": 2.4375, + "grad_norm_var": 0.01763916015625, + "learning_rate": 0.0001, + "loss": 7.3076, + "loss/crossentropy": 2.3292022943496704, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.24243366718292236, + "step": 5642 + }, + { + "epoch": 0.35275, + "grad_norm": 2.234375, + "grad_norm_var": 0.015672810872395835, + "learning_rate": 0.0001, + "loss": 7.229, + "loss/crossentropy": 2.081650137901306, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2038363665342331, + "step": 5644 + }, + { + "epoch": 0.352875, + "grad_norm": 2.09375, + "grad_norm_var": 0.016337076822916668, + "learning_rate": 0.0001, + "loss": 7.1489, + "loss/crossentropy": 2.1960020065307617, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2301100641489029, + "step": 5646 + }, + { + "epoch": 0.353, + "grad_norm": 2.234375, + "grad_norm_var": 0.014583333333333334, + "learning_rate": 0.0001, + "loss": 7.247, + "loss/crossentropy": 2.379251480102539, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.219240702688694, + "step": 5648 + }, + { + "epoch": 0.353125, + "grad_norm": 2.234375, + "grad_norm_var": 0.009273274739583334, + "learning_rate": 0.0001, + "loss": 7.3936, + "loss/crossentropy": 2.380456566810608, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22222916781902313, + "step": 5650 + }, + { + "epoch": 0.35325, + "grad_norm": 2.125, + "grad_norm_var": 0.009650675455729167, + "learning_rate": 0.0001, + "loss": 7.2234, + "loss/crossentropy": 2.2347010374069214, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22965724766254425, + "step": 5652 + }, + { + "epoch": 0.353375, + "grad_norm": 2.1875, + "grad_norm_var": 0.008226521809895833, + "learning_rate": 0.0001, + "loss": 7.4284, + "loss/crossentropy": 2.169225573539734, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20262756943702698, + "step": 5654 + }, + { + "epoch": 0.3535, + "grad_norm": 2.390625, + "grad_norm_var": 0.010026041666666667, + "learning_rate": 0.0001, + "loss": 7.226, + "loss/crossentropy": 2.189695119857788, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22685642540454865, + "step": 5656 + }, + { + "epoch": 0.353625, + "grad_norm": 2.171875, + "grad_norm_var": 0.008617146809895834, + "learning_rate": 0.0001, + "loss": 7.2095, + "loss/crossentropy": 2.382868766784668, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2241538092494011, + "step": 5658 + }, + { + "epoch": 0.35375, + "grad_norm": 2.328125, + "grad_norm_var": 0.011812337239583333, + "learning_rate": 0.0001, + "loss": 7.251, + "loss/crossentropy": 2.085647702217102, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22093788534402847, + "step": 5660 + }, + { + "epoch": 0.353875, + "grad_norm": 2.234375, + "grad_norm_var": 0.0098785400390625, + "learning_rate": 0.0001, + "loss": 7.2481, + "loss/crossentropy": 1.988870620727539, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20682383328676224, + "step": 5662 + }, + { + "epoch": 0.354, + "grad_norm": 2.25, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 7.3857, + "loss/crossentropy": 2.2957637310028076, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22826778888702393, + "step": 5664 + }, + { + "epoch": 0.354125, + "grad_norm": 2.21875, + "grad_norm_var": 0.010326131184895834, + "learning_rate": 0.0001, + "loss": 7.3686, + "loss/crossentropy": 2.2440401315689087, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21839071810245514, + "step": 5666 + }, + { + "epoch": 0.35425, + "grad_norm": 2.46875, + "grad_norm_var": 0.009618123372395834, + "learning_rate": 0.0001, + "loss": 7.1635, + "loss/crossentropy": 2.2398467659950256, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2106655314564705, + "step": 5668 + }, + { + "epoch": 0.354375, + "grad_norm": 2.15625, + "grad_norm_var": 0.009993489583333333, + "learning_rate": 0.0001, + "loss": 7.1971, + "loss/crossentropy": 2.2117063999176025, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22577807307243347, + "step": 5670 + }, + { + "epoch": 0.3545, + "grad_norm": 2.0625, + "grad_norm_var": 0.012824503580729167, + "learning_rate": 0.0001, + "loss": 7.2778, + "loss/crossentropy": 2.489393711090088, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22771210968494415, + "step": 5672 + }, + { + "epoch": 0.354625, + "grad_norm": 2.140625, + "grad_norm_var": 0.0126861572265625, + "learning_rate": 0.0001, + "loss": 7.2026, + "loss/crossentropy": 2.0024437308311462, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.20530618727207184, + "step": 5674 + }, + { + "epoch": 0.35475, + "grad_norm": 2.421875, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 7.2365, + "loss/crossentropy": 2.390307068824768, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21883082389831543, + "step": 5676 + }, + { + "epoch": 0.354875, + "grad_norm": 2.28125, + "grad_norm_var": 0.011653645833333334, + "learning_rate": 0.0001, + "loss": 7.3552, + "loss/crossentropy": 2.237827181816101, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21198499202728271, + "step": 5678 + }, + { + "epoch": 0.355, + "grad_norm": 2.0, + "grad_norm_var": 0.0149322509765625, + "learning_rate": 0.0001, + "loss": 7.1375, + "loss/crossentropy": 2.071999192237854, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20247787237167358, + "step": 5680 + }, + { + "epoch": 0.355125, + "grad_norm": 2.078125, + "grad_norm_var": 0.02051366170247396, + "learning_rate": 0.0001, + "loss": 7.2086, + "loss/crossentropy": 2.006256639957428, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19993876665830612, + "step": 5682 + }, + { + "epoch": 0.35525, + "grad_norm": 2.09375, + "grad_norm_var": 0.016534169514973957, + "learning_rate": 0.0001, + "loss": 7.2393, + "loss/crossentropy": 2.4324631690979004, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2262727990746498, + "step": 5684 + }, + { + "epoch": 0.355375, + "grad_norm": 2.234375, + "grad_norm_var": 0.015958404541015624, + "learning_rate": 0.0001, + "loss": 7.1675, + "loss/crossentropy": 2.2547385692596436, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21899420768022537, + "step": 5686 + }, + { + "epoch": 0.3555, + "grad_norm": 2.296875, + "grad_norm_var": 0.016318511962890626, + "learning_rate": 0.0001, + "loss": 7.1257, + "loss/crossentropy": 2.3510611057281494, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21146484464406967, + "step": 5688 + }, + { + "epoch": 0.355625, + "grad_norm": 2.140625, + "grad_norm_var": 0.016094716389973958, + "learning_rate": 0.0001, + "loss": 7.2481, + "loss/crossentropy": 2.157021403312683, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2122403085231781, + "step": 5690 + }, + { + "epoch": 0.35575, + "grad_norm": 2.1875, + "grad_norm_var": 0.012463124593098958, + "learning_rate": 0.0001, + "loss": 7.3541, + "loss/crossentropy": 2.340176820755005, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21765124797821045, + "step": 5692 + }, + { + "epoch": 0.355875, + "grad_norm": 2.359375, + "grad_norm_var": 0.013303375244140625, + "learning_rate": 0.0001, + "loss": 7.4739, + "loss/crossentropy": 2.550337553024292, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2295769453048706, + "step": 5694 + }, + { + "epoch": 0.356, + "grad_norm": 2.078125, + "grad_norm_var": 0.012410227457682292, + "learning_rate": 0.0001, + "loss": 7.3001, + "loss/crossentropy": 2.221598982810974, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20915590971708298, + "step": 5696 + }, + { + "epoch": 0.356125, + "grad_norm": 2.265625, + "grad_norm_var": 0.008756510416666667, + "learning_rate": 0.0001, + "loss": 7.3883, + "loss/crossentropy": 2.3473533391952515, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20821285992860794, + "step": 5698 + }, + { + "epoch": 0.35625, + "grad_norm": 2.421875, + "grad_norm_var": 0.011335245768229167, + "learning_rate": 0.0001, + "loss": 7.1648, + "loss/crossentropy": 2.310244917869568, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21906693279743195, + "step": 5700 + }, + { + "epoch": 0.356375, + "grad_norm": 2.125, + "grad_norm_var": 0.014481608072916667, + "learning_rate": 0.0001, + "loss": 7.3204, + "loss/crossentropy": 2.333138346672058, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.23273716866970062, + "step": 5702 + }, + { + "epoch": 0.3565, + "grad_norm": 2.140625, + "grad_norm_var": 0.014069620768229167, + "learning_rate": 0.0001, + "loss": 7.1919, + "loss/crossentropy": 2.0340664386749268, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.22012364119291306, + "step": 5704 + }, + { + "epoch": 0.356625, + "grad_norm": 2.921875, + "grad_norm_var": 0.04997456868489583, + "learning_rate": 0.0001, + "loss": 7.4458, + "loss/crossentropy": 2.390481472015381, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22539138793945312, + "step": 5706 + }, + { + "epoch": 0.35675, + "grad_norm": 2.03125, + "grad_norm_var": 0.05371805826822917, + "learning_rate": 0.0001, + "loss": 7.1741, + "loss/crossentropy": 2.3384220600128174, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21385760605335236, + "step": 5708 + }, + { + "epoch": 0.356875, + "grad_norm": 2.25, + "grad_norm_var": 0.052734375, + "learning_rate": 0.0001, + "loss": 7.3415, + "loss/crossentropy": 2.266079902648926, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21113939583301544, + "step": 5710 + }, + { + "epoch": 0.357, + "grad_norm": 2.28125, + "grad_norm_var": 0.05022379557291667, + "learning_rate": 0.0001, + "loss": 7.2609, + "loss/crossentropy": 2.290347933769226, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22176006436347961, + "step": 5712 + }, + { + "epoch": 0.357125, + "grad_norm": 1.9375, + "grad_norm_var": 0.059651692708333336, + "learning_rate": 0.0001, + "loss": 7.0321, + "loss/crossentropy": 2.053887665271759, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20520412921905518, + "step": 5714 + }, + { + "epoch": 0.35725, + "grad_norm": 2.140625, + "grad_norm_var": 0.05829671223958333, + "learning_rate": 0.0001, + "loss": 7.1609, + "loss/crossentropy": 2.2182360887527466, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19742074608802795, + "step": 5716 + }, + { + "epoch": 0.357375, + "grad_norm": 2.140625, + "grad_norm_var": 0.05565999348958333, + "learning_rate": 0.0001, + "loss": 7.1431, + "loss/crossentropy": 2.2058277130126953, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20125412940979004, + "step": 5718 + }, + { + "epoch": 0.3575, + "grad_norm": 2.265625, + "grad_norm_var": 0.0562408447265625, + "learning_rate": 0.0001, + "loss": 7.252, + "loss/crossentropy": 2.047255039215088, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.19879010319709778, + "step": 5720 + }, + { + "epoch": 0.357625, + "grad_norm": 2.28125, + "grad_norm_var": 0.013508097330729166, + "learning_rate": 0.0001, + "loss": 7.1866, + "loss/crossentropy": 2.2525157928466797, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.23531018197536469, + "step": 5722 + }, + { + "epoch": 0.35775, + "grad_norm": 1.96875, + "grad_norm_var": 0.017476399739583332, + "learning_rate": 0.0001, + "loss": 6.9843, + "loss/crossentropy": 2.151524543762207, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21920562535524368, + "step": 5724 + }, + { + "epoch": 0.357875, + "grad_norm": 2.265625, + "grad_norm_var": 0.020572916666666666, + "learning_rate": 0.0001, + "loss": 7.3308, + "loss/crossentropy": 2.113102436065674, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20669876039028168, + "step": 5726 + }, + { + "epoch": 0.358, + "grad_norm": 2.359375, + "grad_norm_var": 0.023151652018229166, + "learning_rate": 0.0001, + "loss": 7.1118, + "loss/crossentropy": 2.0734687447547913, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20580605417490005, + "step": 5728 + }, + { + "epoch": 0.358125, + "grad_norm": 2.140625, + "grad_norm_var": 0.018180338541666667, + "learning_rate": 0.0001, + "loss": 7.2682, + "loss/crossentropy": 2.2568482160568237, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21253487467765808, + "step": 5730 + }, + { + "epoch": 0.35825, + "grad_norm": 2.3125, + "grad_norm_var": 0.017723592122395833, + "learning_rate": 0.0001, + "loss": 7.4277, + "loss/crossentropy": 2.1575281620025635, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20935780555009842, + "step": 5732 + }, + { + "epoch": 0.358375, + "grad_norm": 2.15625, + "grad_norm_var": 0.02252197265625, + "learning_rate": 0.0001, + "loss": 7.3061, + "loss/crossentropy": 2.1693350076675415, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21988856047391891, + "step": 5734 + }, + { + "epoch": 0.3585, + "grad_norm": 2.390625, + "grad_norm_var": 0.0248443603515625, + "learning_rate": 0.0001, + "loss": 7.3982, + "loss/crossentropy": 2.4757591485977173, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22211267799139023, + "step": 5736 + }, + { + "epoch": 0.358625, + "grad_norm": 2.203125, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 7.2263, + "loss/crossentropy": 1.9915843605995178, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.20769823342561722, + "step": 5738 + }, + { + "epoch": 0.35875, + "grad_norm": 2.359375, + "grad_norm_var": 0.011701456705729167, + "learning_rate": 0.0001, + "loss": 7.369, + "loss/crossentropy": 2.206750988960266, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2206248715519905, + "step": 5740 + }, + { + "epoch": 0.358875, + "grad_norm": 2.390625, + "grad_norm_var": 0.011865234375, + "learning_rate": 0.0001, + "loss": 7.373, + "loss/crossentropy": 2.46751606464386, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2296297699213028, + "step": 5742 + }, + { + "epoch": 0.359, + "grad_norm": 2.140625, + "grad_norm_var": 0.01314697265625, + "learning_rate": 0.0001, + "loss": 7.4235, + "loss/crossentropy": 2.4122776985168457, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21703825145959854, + "step": 5744 + }, + { + "epoch": 0.359125, + "grad_norm": 2.25, + "grad_norm_var": 0.012939453125, + "learning_rate": 0.0001, + "loss": 7.4364, + "loss/crossentropy": 2.4436628818511963, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21490082144737244, + "step": 5746 + }, + { + "epoch": 0.35925, + "grad_norm": 2.0625, + "grad_norm_var": 0.015934244791666666, + "learning_rate": 0.0001, + "loss": 7.0069, + "loss/crossentropy": 2.0449106693267822, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.18983326852321625, + "step": 5748 + }, + { + "epoch": 0.359375, + "grad_norm": 2.125, + "grad_norm_var": 0.013898722330729167, + "learning_rate": 0.0001, + "loss": 7.1814, + "loss/crossentropy": 2.1521427631378174, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20896754413843155, + "step": 5750 + }, + { + "epoch": 0.3595, + "grad_norm": 2.421875, + "grad_norm_var": 0.012788899739583333, + "learning_rate": 0.0001, + "loss": 7.2468, + "loss/crossentropy": 2.1787991523742676, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2136235386133194, + "step": 5752 + }, + { + "epoch": 0.359625, + "grad_norm": 2.125, + "grad_norm_var": 0.013395182291666667, + "learning_rate": 0.0001, + "loss": 7.1933, + "loss/crossentropy": 2.145695447921753, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20177044719457626, + "step": 5754 + }, + { + "epoch": 0.35975, + "grad_norm": 2.328125, + "grad_norm_var": 0.013916015625, + "learning_rate": 0.0001, + "loss": 7.2973, + "loss/crossentropy": 2.315553665161133, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21916157007217407, + "step": 5756 + }, + { + "epoch": 0.359875, + "grad_norm": 2.203125, + "grad_norm_var": 0.011351521809895833, + "learning_rate": 0.0001, + "loss": 7.2535, + "loss/crossentropy": 2.1844006776809692, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21652723848819733, + "step": 5758 + }, + { + "epoch": 0.36, + "grad_norm": 2.15625, + "grad_norm_var": 0.010936482747395834, + "learning_rate": 0.0001, + "loss": 7.1545, + "loss/crossentropy": 2.140083909034729, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2082248032093048, + "step": 5760 + }, + { + "epoch": 0.360125, + "grad_norm": 2.21875, + "grad_norm_var": 0.010640462239583334, + "learning_rate": 0.0001, + "loss": 7.3526, + "loss/crossentropy": 2.4246350526809692, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2654386907815933, + "step": 5762 + }, + { + "epoch": 0.36025, + "grad_norm": 2.34375, + "grad_norm_var": 0.011116536458333333, + "learning_rate": 0.0001, + "loss": 7.236, + "loss/crossentropy": 2.190550684928894, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21372459083795547, + "step": 5764 + }, + { + "epoch": 0.360375, + "grad_norm": 2.1875, + "grad_norm_var": 0.0099517822265625, + "learning_rate": 0.0001, + "loss": 7.198, + "loss/crossentropy": 2.2206780910491943, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21862171590328217, + "step": 5766 + }, + { + "epoch": 0.3605, + "grad_norm": 2.203125, + "grad_norm_var": 0.008036295572916666, + "learning_rate": 0.0001, + "loss": 7.139, + "loss/crossentropy": 2.0877394676208496, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21827848255634308, + "step": 5768 + }, + { + "epoch": 0.360625, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010497792561848959, + "learning_rate": 0.0001, + "loss": 7.1267, + "loss/crossentropy": 2.1852867603302, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.19439171254634857, + "step": 5770 + }, + { + "epoch": 0.36075, + "grad_norm": 2.359375, + "grad_norm_var": 0.009500885009765625, + "learning_rate": 0.0001, + "loss": 7.2532, + "loss/crossentropy": 2.2738914489746094, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20999271422624588, + "step": 5772 + }, + { + "epoch": 0.360875, + "grad_norm": 2.140625, + "grad_norm_var": 0.010184478759765626, + "learning_rate": 0.0001, + "loss": 7.3997, + "loss/crossentropy": 2.32577121257782, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20629432797431946, + "step": 5774 + }, + { + "epoch": 0.361, + "grad_norm": 2.53125, + "grad_norm_var": 0.017978668212890625, + "learning_rate": 0.0001, + "loss": 7.205, + "loss/crossentropy": 2.3490023612976074, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23214927315711975, + "step": 5776 + }, + { + "epoch": 0.361125, + "grad_norm": 2.234375, + "grad_norm_var": 0.02216364542643229, + "learning_rate": 0.0001, + "loss": 7.2525, + "loss/crossentropy": 2.041129231452942, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.1971891075372696, + "step": 5778 + }, + { + "epoch": 0.36125, + "grad_norm": 2.28125, + "grad_norm_var": 0.020957183837890626, + "learning_rate": 0.0001, + "loss": 7.1419, + "loss/crossentropy": 2.4754010438919067, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2261795774102211, + "step": 5780 + }, + { + "epoch": 0.361375, + "grad_norm": 2.265625, + "grad_norm_var": 0.020947011311848958, + "learning_rate": 0.0001, + "loss": 7.4145, + "loss/crossentropy": 2.3293362855911255, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2322985827922821, + "step": 5782 + }, + { + "epoch": 0.3615, + "grad_norm": 2.21875, + "grad_norm_var": 0.019608306884765624, + "learning_rate": 0.0001, + "loss": 7.2454, + "loss/crossentropy": 2.2818493843078613, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2164430171251297, + "step": 5784 + }, + { + "epoch": 0.361625, + "grad_norm": 2.125, + "grad_norm_var": 0.0157379150390625, + "learning_rate": 0.0001, + "loss": 7.3042, + "loss/crossentropy": 2.418179750442505, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22034931927919388, + "step": 5786 + }, + { + "epoch": 0.36175, + "grad_norm": 2.453125, + "grad_norm_var": 0.017431640625, + "learning_rate": 0.0001, + "loss": 7.2657, + "loss/crossentropy": 2.1933167576789856, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2206980139017105, + "step": 5788 + }, + { + "epoch": 0.361875, + "grad_norm": 2.453125, + "grad_norm_var": 0.018863932291666666, + "learning_rate": 0.0001, + "loss": 7.175, + "loss/crossentropy": 2.2669215202331543, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.20475652068853378, + "step": 5790 + }, + { + "epoch": 0.362, + "grad_norm": 2.15625, + "grad_norm_var": 0.017643229166666666, + "learning_rate": 0.0001, + "loss": 7.247, + "loss/crossentropy": 2.1614081859588623, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20143014192581177, + "step": 5792 + }, + { + "epoch": 0.362125, + "grad_norm": 2.09375, + "grad_norm_var": 0.020113118489583335, + "learning_rate": 0.0001, + "loss": 7.1625, + "loss/crossentropy": 2.088079333305359, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20542144775390625, + "step": 5794 + }, + { + "epoch": 0.36225, + "grad_norm": 2.234375, + "grad_norm_var": 0.019310506184895833, + "learning_rate": 0.0001, + "loss": 7.362, + "loss/crossentropy": 2.2279645204544067, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2040068879723549, + "step": 5796 + }, + { + "epoch": 0.362375, + "grad_norm": 2.203125, + "grad_norm_var": 0.018903605143229165, + "learning_rate": 0.0001, + "loss": 7.1413, + "loss/crossentropy": 2.3002779483795166, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2167448326945305, + "step": 5798 + }, + { + "epoch": 0.3625, + "grad_norm": 2.28125, + "grad_norm_var": 0.01959228515625, + "learning_rate": 0.0001, + "loss": 7.2912, + "loss/crossentropy": 2.49115788936615, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22317777574062347, + "step": 5800 + }, + { + "epoch": 0.362625, + "grad_norm": 2.171875, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 7.2608, + "loss/crossentropy": 2.2841527462005615, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2520540952682495, + "step": 5802 + }, + { + "epoch": 0.36275, + "grad_norm": 2.1875, + "grad_norm_var": 0.017210896809895834, + "learning_rate": 0.0001, + "loss": 7.1874, + "loss/crossentropy": 2.3316045999526978, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.20709974318742752, + "step": 5804 + }, + { + "epoch": 0.362875, + "grad_norm": 2.234375, + "grad_norm_var": 0.014354451497395834, + "learning_rate": 0.0001, + "loss": 7.2947, + "loss/crossentropy": 2.1988085508346558, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21208590269088745, + "step": 5806 + }, + { + "epoch": 0.363, + "grad_norm": 2.171875, + "grad_norm_var": 0.007710774739583333, + "learning_rate": 0.0001, + "loss": 7.2853, + "loss/crossentropy": 2.404086947441101, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.23354601860046387, + "step": 5808 + }, + { + "epoch": 0.363125, + "grad_norm": 2.296875, + "grad_norm_var": 0.0051422119140625, + "learning_rate": 0.0001, + "loss": 7.3537, + "loss/crossentropy": 2.3549355268478394, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21141213178634644, + "step": 5810 + }, + { + "epoch": 0.36325, + "grad_norm": 2.140625, + "grad_norm_var": 0.00552978515625, + "learning_rate": 0.0001, + "loss": 7.3451, + "loss/crossentropy": 2.428610682487488, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22301796823740005, + "step": 5812 + }, + { + "epoch": 0.363375, + "grad_norm": 2.3125, + "grad_norm_var": 0.0060536702473958336, + "learning_rate": 0.0001, + "loss": 7.3552, + "loss/crossentropy": 2.033466935157776, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.196543850004673, + "step": 5814 + }, + { + "epoch": 0.3635, + "grad_norm": 2.28125, + "grad_norm_var": 0.005150349934895834, + "learning_rate": 0.0001, + "loss": 7.3336, + "loss/crossentropy": 2.324714779853821, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21623509377241135, + "step": 5816 + }, + { + "epoch": 0.363625, + "grad_norm": 2.109375, + "grad_norm_var": 0.004976399739583333, + "learning_rate": 0.0001, + "loss": 7.1483, + "loss/crossentropy": 2.2404175996780396, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20572540163993835, + "step": 5818 + }, + { + "epoch": 0.36375, + "grad_norm": 2.59375, + "grad_norm_var": 0.014676920572916667, + "learning_rate": 0.0001, + "loss": 7.375, + "loss/crossentropy": 2.269519567489624, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2145681381225586, + "step": 5820 + }, + { + "epoch": 0.363875, + "grad_norm": 2.046875, + "grad_norm_var": 0.016779581705729168, + "learning_rate": 0.0001, + "loss": 7.2622, + "loss/crossentropy": 2.202036142349243, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2079039216041565, + "step": 5822 + }, + { + "epoch": 0.364, + "grad_norm": 2.203125, + "grad_norm_var": 0.0169097900390625, + "learning_rate": 0.0001, + "loss": 7.193, + "loss/crossentropy": 2.546125888824463, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21474044024944305, + "step": 5824 + }, + { + "epoch": 0.364125, + "grad_norm": 2.328125, + "grad_norm_var": 0.016624959309895833, + "learning_rate": 0.0001, + "loss": 7.2476, + "loss/crossentropy": 2.285582184791565, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21190346777439117, + "step": 5826 + }, + { + "epoch": 0.36425, + "grad_norm": 2.28125, + "grad_norm_var": 0.016373697916666666, + "learning_rate": 0.0001, + "loss": 7.3289, + "loss/crossentropy": 2.239107668399811, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.19788258522748947, + "step": 5828 + }, + { + "epoch": 0.364375, + "grad_norm": 2.09375, + "grad_norm_var": 0.017015584309895835, + "learning_rate": 0.0001, + "loss": 7.3511, + "loss/crossentropy": 2.401890516281128, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2497824877500534, + "step": 5830 + }, + { + "epoch": 0.3645, + "grad_norm": 2.296875, + "grad_norm_var": 0.019401041666666667, + "learning_rate": 0.0001, + "loss": 7.2613, + "loss/crossentropy": 2.2030797004699707, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.1958223581314087, + "step": 5832 + }, + { + "epoch": 0.364625, + "grad_norm": 2.3125, + "grad_norm_var": 0.0217926025390625, + "learning_rate": 0.0001, + "loss": 7.2132, + "loss/crossentropy": 2.241069197654724, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22120070457458496, + "step": 5834 + }, + { + "epoch": 0.36475, + "grad_norm": 2.15625, + "grad_norm_var": 0.014045206705729167, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 2.346863269805908, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22068895399570465, + "step": 5836 + }, + { + "epoch": 0.364875, + "grad_norm": 2.484375, + "grad_norm_var": 0.015950520833333332, + "learning_rate": 0.0001, + "loss": 7.3054, + "loss/crossentropy": 2.157726287841797, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.19728650152683258, + "step": 5838 + }, + { + "epoch": 0.365, + "grad_norm": 2.203125, + "grad_norm_var": 0.0157867431640625, + "learning_rate": 0.0001, + "loss": 7.1046, + "loss/crossentropy": 2.1430864334106445, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19296472519636154, + "step": 5840 + }, + { + "epoch": 0.365125, + "grad_norm": 2.25, + "grad_norm_var": 0.015152994791666667, + "learning_rate": 0.0001, + "loss": 7.2594, + "loss/crossentropy": 2.453359365463257, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2097448706626892, + "step": 5842 + }, + { + "epoch": 0.36525, + "grad_norm": 2.359375, + "grad_norm_var": 0.015885416666666666, + "learning_rate": 0.0001, + "loss": 7.2375, + "loss/crossentropy": 2.137434482574463, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19895398616790771, + "step": 5844 + }, + { + "epoch": 0.365375, + "grad_norm": 2.203125, + "grad_norm_var": 0.014354451497395834, + "learning_rate": 0.0001, + "loss": 7.3807, + "loss/crossentropy": 2.1278064846992493, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2288549244403839, + "step": 5846 + }, + { + "epoch": 0.3655, + "grad_norm": 2.140625, + "grad_norm_var": 0.0134918212890625, + "learning_rate": 0.0001, + "loss": 7.1056, + "loss/crossentropy": 2.465573310852051, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22544366866350174, + "step": 5848 + }, + { + "epoch": 0.365625, + "grad_norm": 2.09375, + "grad_norm_var": 0.011649576822916667, + "learning_rate": 0.0001, + "loss": 7.2308, + "loss/crossentropy": 1.9526810050010681, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19054138660430908, + "step": 5850 + }, + { + "epoch": 0.36575, + "grad_norm": 2.625, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 7.3444, + "loss/crossentropy": 2.3907413482666016, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22131036221981049, + "step": 5852 + }, + { + "epoch": 0.365875, + "grad_norm": 2.15625, + "grad_norm_var": 0.020210774739583333, + "learning_rate": 0.0001, + "loss": 7.3632, + "loss/crossentropy": 2.4008761644363403, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22065234184265137, + "step": 5854 + }, + { + "epoch": 0.366, + "grad_norm": 2.078125, + "grad_norm_var": 0.023111979166666668, + "learning_rate": 0.0001, + "loss": 7.1694, + "loss/crossentropy": 2.389074921607971, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21668195724487305, + "step": 5856 + }, + { + "epoch": 0.366125, + "grad_norm": 2.203125, + "grad_norm_var": 0.023469034830729166, + "learning_rate": 0.0001, + "loss": 7.1397, + "loss/crossentropy": 2.4073052406311035, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2183263897895813, + "step": 5858 + }, + { + "epoch": 0.36625, + "grad_norm": 2.234375, + "grad_norm_var": 0.02408447265625, + "learning_rate": 0.0001, + "loss": 7.2806, + "loss/crossentropy": 2.45753812789917, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22091014683246613, + "step": 5860 + }, + { + "epoch": 0.366375, + "grad_norm": 2.15625, + "grad_norm_var": 0.024247233072916666, + "learning_rate": 0.0001, + "loss": 7.1823, + "loss/crossentropy": 2.2068198919296265, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2208571657538414, + "step": 5862 + }, + { + "epoch": 0.3665, + "grad_norm": 1.9609375, + "grad_norm_var": 0.027530670166015625, + "learning_rate": 0.0001, + "loss": 7.0967, + "loss/crossentropy": 2.0362515449523926, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.1910586953163147, + "step": 5864 + }, + { + "epoch": 0.366625, + "grad_norm": 2.296875, + "grad_norm_var": 0.02529881795247396, + "learning_rate": 0.0001, + "loss": 7.2175, + "loss/crossentropy": 2.3787707090377808, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20581073313951492, + "step": 5866 + }, + { + "epoch": 0.36675, + "grad_norm": 2.09375, + "grad_norm_var": 0.014542388916015624, + "learning_rate": 0.0001, + "loss": 7.112, + "loss/crossentropy": 2.222362995147705, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21793870627880096, + "step": 5868 + }, + { + "epoch": 0.366875, + "grad_norm": 2.265625, + "grad_norm_var": 0.011909739176432291, + "learning_rate": 0.0001, + "loss": 7.1673, + "loss/crossentropy": 2.208632707595825, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22443342208862305, + "step": 5870 + }, + { + "epoch": 0.367, + "grad_norm": 2.1875, + "grad_norm_var": 0.010330963134765624, + "learning_rate": 0.0001, + "loss": 7.3225, + "loss/crossentropy": 1.9650804996490479, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20253446698188782, + "step": 5872 + }, + { + "epoch": 0.367125, + "grad_norm": 2.15625, + "grad_norm_var": 0.011270904541015625, + "learning_rate": 0.0001, + "loss": 7.2269, + "loss/crossentropy": 2.1838788986206055, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22055459022521973, + "step": 5874 + }, + { + "epoch": 0.36725, + "grad_norm": 2.4375, + "grad_norm_var": 0.013565826416015624, + "learning_rate": 0.0001, + "loss": 7.0766, + "loss/crossentropy": 2.2628434896469116, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20941808074712753, + "step": 5876 + }, + { + "epoch": 0.367375, + "grad_norm": 1.953125, + "grad_norm_var": 0.01853205362955729, + "learning_rate": 0.0001, + "loss": 7.278, + "loss/crossentropy": 2.1746416687965393, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2203633338212967, + "step": 5878 + }, + { + "epoch": 0.3675, + "grad_norm": 2.40625, + "grad_norm_var": 0.017560831705729165, + "learning_rate": 0.0001, + "loss": 7.2906, + "loss/crossentropy": 2.2157901525497437, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20976532995700836, + "step": 5880 + }, + { + "epoch": 0.367625, + "grad_norm": 2.015625, + "grad_norm_var": 0.020563761393229168, + "learning_rate": 0.0001, + "loss": 7.2316, + "loss/crossentropy": 2.40118670463562, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22522136569023132, + "step": 5882 + }, + { + "epoch": 0.36775, + "grad_norm": 2.28125, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 7.2647, + "loss/crossentropy": 2.5736021995544434, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2218579277396202, + "step": 5884 + }, + { + "epoch": 0.367875, + "grad_norm": 2.34375, + "grad_norm_var": 0.020270792643229167, + "learning_rate": 0.0001, + "loss": 7.3696, + "loss/crossentropy": 2.074417471885681, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.21545039117336273, + "step": 5886 + }, + { + "epoch": 0.368, + "grad_norm": 2.125, + "grad_norm_var": 0.020865885416666667, + "learning_rate": 0.0001, + "loss": 7.1881, + "loss/crossentropy": 2.3205907344818115, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20773284137248993, + "step": 5888 + }, + { + "epoch": 0.368125, + "grad_norm": 2.34375, + "grad_norm_var": 0.021393839518229166, + "learning_rate": 0.0001, + "loss": 7.073, + "loss/crossentropy": 1.872346818447113, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.204973466694355, + "step": 5890 + }, + { + "epoch": 0.36825, + "grad_norm": 2.171875, + "grad_norm_var": 0.017332967122395834, + "learning_rate": 0.0001, + "loss": 7.2071, + "loss/crossentropy": 2.166461229324341, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21400277316570282, + "step": 5892 + }, + { + "epoch": 0.368375, + "grad_norm": 2.265625, + "grad_norm_var": 0.012279256184895834, + "learning_rate": 0.0001, + "loss": 7.1804, + "loss/crossentropy": 2.2579694986343384, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22205153107643127, + "step": 5894 + }, + { + "epoch": 0.3685, + "grad_norm": 2.234375, + "grad_norm_var": 0.008882649739583333, + "learning_rate": 0.0001, + "loss": 7.3781, + "loss/crossentropy": 2.3255600929260254, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20994628965854645, + "step": 5896 + }, + { + "epoch": 0.368625, + "grad_norm": 2.140625, + "grad_norm_var": 0.00933837890625, + "learning_rate": 0.0001, + "loss": 7.2894, + "loss/crossentropy": 2.6723464727401733, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.21977736055850983, + "step": 5898 + }, + { + "epoch": 0.36875, + "grad_norm": 2.265625, + "grad_norm_var": 0.010212198893229166, + "learning_rate": 0.0001, + "loss": 7.2128, + "loss/crossentropy": 2.2925814390182495, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.24049442261457443, + "step": 5900 + }, + { + "epoch": 0.368875, + "grad_norm": 2.359375, + "grad_norm_var": 0.0120513916015625, + "learning_rate": 0.0001, + "loss": 7.5009, + "loss/crossentropy": 2.6006277799606323, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22706793248653412, + "step": 5902 + }, + { + "epoch": 0.369, + "grad_norm": 2.34375, + "grad_norm_var": 0.0119537353515625, + "learning_rate": 0.0001, + "loss": 7.2934, + "loss/crossentropy": 2.4217220544815063, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23213213682174683, + "step": 5904 + }, + { + "epoch": 0.369125, + "grad_norm": 2.1875, + "grad_norm_var": 0.01031494140625, + "learning_rate": 0.0001, + "loss": 7.2382, + "loss/crossentropy": 2.1426541805267334, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.18587464094161987, + "step": 5906 + }, + { + "epoch": 0.36925, + "grad_norm": 2.25, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 7.1082, + "loss/crossentropy": 2.2752933502197266, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21011126041412354, + "step": 5908 + }, + { + "epoch": 0.369375, + "grad_norm": 2.546875, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 7.3066, + "loss/crossentropy": 2.1513302326202393, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2277527153491974, + "step": 5910 + }, + { + "epoch": 0.3695, + "grad_norm": 2.328125, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 7.2429, + "loss/crossentropy": 2.327723264694214, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22254379838705063, + "step": 5912 + }, + { + "epoch": 0.369625, + "grad_norm": 2.234375, + "grad_norm_var": 0.01705322265625, + "learning_rate": 0.0001, + "loss": 7.1405, + "loss/crossentropy": 2.0437814593315125, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2087857574224472, + "step": 5914 + }, + { + "epoch": 0.36975, + "grad_norm": 2.09375, + "grad_norm_var": 0.0169342041015625, + "learning_rate": 0.0001, + "loss": 7.464, + "loss/crossentropy": 2.3539143800735474, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21950966119766235, + "step": 5916 + }, + { + "epoch": 0.369875, + "grad_norm": 2.203125, + "grad_norm_var": 0.01630859375, + "learning_rate": 0.0001, + "loss": 7.2979, + "loss/crossentropy": 2.4984445571899414, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.19562814384698868, + "step": 5918 + }, + { + "epoch": 0.37, + "grad_norm": 2.1875, + "grad_norm_var": 0.015355428059895834, + "learning_rate": 0.0001, + "loss": 7.3028, + "loss/crossentropy": 2.4766656160354614, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21741511672735214, + "step": 5920 + }, + { + "epoch": 0.370125, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020672353108723958, + "learning_rate": 0.0001, + "loss": 7.1708, + "loss/crossentropy": 2.243171215057373, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2148171290755272, + "step": 5922 + }, + { + "epoch": 0.37025, + "grad_norm": 2.375, + "grad_norm_var": 0.01923192342122396, + "learning_rate": 0.0001, + "loss": 7.321, + "loss/crossentropy": 2.036896765232086, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2031344696879387, + "step": 5924 + }, + { + "epoch": 0.370375, + "grad_norm": 2.15625, + "grad_norm_var": 0.011864980061848959, + "learning_rate": 0.0001, + "loss": 7.2207, + "loss/crossentropy": 2.084986448287964, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21330830454826355, + "step": 5926 + }, + { + "epoch": 0.3705, + "grad_norm": 2.140625, + "grad_norm_var": 0.009710439046223958, + "learning_rate": 0.0001, + "loss": 7.2631, + "loss/crossentropy": 2.224330186843872, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21560171246528625, + "step": 5928 + }, + { + "epoch": 0.370625, + "grad_norm": 2.34375, + "grad_norm_var": 0.011277008056640624, + "learning_rate": 0.0001, + "loss": 7.311, + "loss/crossentropy": 2.49066960811615, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21737707406282425, + "step": 5930 + }, + { + "epoch": 0.37075, + "grad_norm": 2.046875, + "grad_norm_var": 0.010528310139973959, + "learning_rate": 0.0001, + "loss": 7.3316, + "loss/crossentropy": 2.3138331174850464, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21158859878778458, + "step": 5932 + }, + { + "epoch": 0.370875, + "grad_norm": 2.1875, + "grad_norm_var": 0.010483551025390624, + "learning_rate": 0.0001, + "loss": 7.1589, + "loss/crossentropy": 2.152156710624695, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2170645147562027, + "step": 5934 + }, + { + "epoch": 0.371, + "grad_norm": 2.21875, + "grad_norm_var": 0.010603586832682291, + "learning_rate": 0.0001, + "loss": 7.2144, + "loss/crossentropy": 2.0164735317230225, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.1979883462190628, + "step": 5936 + }, + { + "epoch": 0.371125, + "grad_norm": 2.25, + "grad_norm_var": 0.007059733072916667, + "learning_rate": 0.0001, + "loss": 7.0243, + "loss/crossentropy": 1.9355103373527527, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20030242949724197, + "step": 5938 + }, + { + "epoch": 0.37125, + "grad_norm": 2.125, + "grad_norm_var": 0.005646769205729167, + "learning_rate": 0.0001, + "loss": 7.2098, + "loss/crossentropy": 2.3397440910339355, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21775859594345093, + "step": 5940 + }, + { + "epoch": 0.371375, + "grad_norm": 1.984375, + "grad_norm_var": 0.00806884765625, + "learning_rate": 0.0001, + "loss": 7.0602, + "loss/crossentropy": 2.1412546634674072, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22875963151454926, + "step": 5942 + }, + { + "epoch": 0.3715, + "grad_norm": 2.53125, + "grad_norm_var": 0.018485514322916667, + "learning_rate": 0.0001, + "loss": 7.2662, + "loss/crossentropy": 2.4508490562438965, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2223380208015442, + "step": 5944 + }, + { + "epoch": 0.371625, + "grad_norm": 2.140625, + "grad_norm_var": 0.0183990478515625, + "learning_rate": 0.0001, + "loss": 7.3082, + "loss/crossentropy": 2.4562530517578125, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2685226500034332, + "step": 5946 + }, + { + "epoch": 0.37175, + "grad_norm": 2.125, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 0.0001, + "loss": 7.1859, + "loss/crossentropy": 2.600165367126465, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2223629280924797, + "step": 5948 + }, + { + "epoch": 0.371875, + "grad_norm": 2.1875, + "grad_norm_var": 0.017024739583333334, + "learning_rate": 0.0001, + "loss": 7.3726, + "loss/crossentropy": 2.579715847969055, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22339096665382385, + "step": 5950 + }, + { + "epoch": 0.372, + "grad_norm": 2.109375, + "grad_norm_var": 0.017964680989583332, + "learning_rate": 0.0001, + "loss": 7.1321, + "loss/crossentropy": 2.3544031381607056, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2049936279654503, + "step": 5952 + }, + { + "epoch": 0.372125, + "grad_norm": 2.296875, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 7.3593, + "loss/crossentropy": 2.570642113685608, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2149905562400818, + "step": 5954 + }, + { + "epoch": 0.37225, + "grad_norm": 2.203125, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 7.2501, + "loss/crossentropy": 2.345090627670288, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21142956614494324, + "step": 5956 + }, + { + "epoch": 0.372375, + "grad_norm": 2.21875, + "grad_norm_var": 0.014322916666666666, + "learning_rate": 0.0001, + "loss": 7.2556, + "loss/crossentropy": 2.083262085914612, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.21225877851247787, + "step": 5958 + }, + { + "epoch": 0.3725, + "grad_norm": 2.328125, + "grad_norm_var": 0.0090728759765625, + "learning_rate": 0.0001, + "loss": 7.3038, + "loss/crossentropy": 2.310503602027893, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2126244381070137, + "step": 5960 + }, + { + "epoch": 0.372625, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011934153238932292, + "learning_rate": 0.0001, + "loss": 7.1571, + "loss/crossentropy": 2.414725184440613, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2153559774160385, + "step": 5962 + }, + { + "epoch": 0.37275, + "grad_norm": 2.078125, + "grad_norm_var": 0.013293202718098958, + "learning_rate": 0.0001, + "loss": 7.1466, + "loss/crossentropy": 2.1802865266799927, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.19948512315750122, + "step": 5964 + }, + { + "epoch": 0.372875, + "grad_norm": 2.265625, + "grad_norm_var": 0.013588205973307291, + "learning_rate": 0.0001, + "loss": 7.3194, + "loss/crossentropy": 2.2380692958831787, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21608977019786835, + "step": 5966 + }, + { + "epoch": 0.373, + "grad_norm": 2.21875, + "grad_norm_var": 0.013390858968098959, + "learning_rate": 0.0001, + "loss": 7.216, + "loss/crossentropy": 2.2425453662872314, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2108849212527275, + "step": 5968 + }, + { + "epoch": 0.373125, + "grad_norm": 2.171875, + "grad_norm_var": 0.012666575113932292, + "learning_rate": 0.0001, + "loss": 7.163, + "loss/crossentropy": 2.289492964744568, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20529530197381973, + "step": 5970 + }, + { + "epoch": 0.37325, + "grad_norm": 2.125, + "grad_norm_var": 0.012385813395182292, + "learning_rate": 0.0001, + "loss": 7.1534, + "loss/crossentropy": 1.9850167036056519, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2159864455461502, + "step": 5972 + }, + { + "epoch": 0.373375, + "grad_norm": 2.53125, + "grad_norm_var": 0.019954172770182292, + "learning_rate": 0.0001, + "loss": 7.1503, + "loss/crossentropy": 2.3028509616851807, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21371204406023026, + "step": 5974 + }, + { + "epoch": 0.3735, + "grad_norm": 2.109375, + "grad_norm_var": 0.01669286092122396, + "learning_rate": 0.0001, + "loss": 7.2954, + "loss/crossentropy": 2.3715981245040894, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21668031811714172, + "step": 5976 + }, + { + "epoch": 0.373625, + "grad_norm": 2.3125, + "grad_norm_var": 0.016048177083333334, + "learning_rate": 0.0001, + "loss": 7.2445, + "loss/crossentropy": 2.125378727912903, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20972995460033417, + "step": 5978 + }, + { + "epoch": 0.37375, + "grad_norm": 2.15625, + "grad_norm_var": 0.014176432291666667, + "learning_rate": 0.0001, + "loss": 7.148, + "loss/crossentropy": 2.019354999065399, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.18493575602769852, + "step": 5980 + }, + { + "epoch": 0.373875, + "grad_norm": 2.234375, + "grad_norm_var": 0.016136678059895833, + "learning_rate": 0.0001, + "loss": 7.2904, + "loss/crossentropy": 2.051500916481018, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21599026024341583, + "step": 5982 + }, + { + "epoch": 0.374, + "grad_norm": 2.21875, + "grad_norm_var": 0.017601521809895833, + "learning_rate": 0.0001, + "loss": 7.4102, + "loss/crossentropy": 2.1651517152786255, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21314673870801926, + "step": 5984 + }, + { + "epoch": 0.374125, + "grad_norm": 2.0625, + "grad_norm_var": 0.01842041015625, + "learning_rate": 0.0001, + "loss": 7.2403, + "loss/crossentropy": 2.254274010658264, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2058999389410019, + "step": 5986 + }, + { + "epoch": 0.37425, + "grad_norm": 2.28125, + "grad_norm_var": 0.018680826822916666, + "learning_rate": 0.0001, + "loss": 7.3251, + "loss/crossentropy": 2.346308946609497, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20627041906118393, + "step": 5988 + }, + { + "epoch": 0.374375, + "grad_norm": 2.15625, + "grad_norm_var": 0.011356608072916666, + "learning_rate": 0.0001, + "loss": 7.2133, + "loss/crossentropy": 2.4631309509277344, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2179097756743431, + "step": 5990 + }, + { + "epoch": 0.3745, + "grad_norm": 2.21875, + "grad_norm_var": 0.0103424072265625, + "learning_rate": 0.0001, + "loss": 7.2366, + "loss/crossentropy": 2.558820962905884, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20852772891521454, + "step": 5992 + }, + { + "epoch": 0.374625, + "grad_norm": 2.140625, + "grad_norm_var": 0.007982381184895833, + "learning_rate": 0.0001, + "loss": 7.2517, + "loss/crossentropy": 2.252416253089905, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21487032622098923, + "step": 5994 + }, + { + "epoch": 0.37475, + "grad_norm": 2.515625, + "grad_norm_var": 0.015999348958333333, + "learning_rate": 0.0001, + "loss": 7.3158, + "loss/crossentropy": 2.4063336849212646, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22586210072040558, + "step": 5996 + }, + { + "epoch": 0.374875, + "grad_norm": 2.21875, + "grad_norm_var": 0.013179524739583334, + "learning_rate": 0.0001, + "loss": 7.1276, + "loss/crossentropy": 1.848585605621338, + "loss/hidden": 2.71875, + "loss/jsd": 0.0, + "loss/logits": 0.19894341379404068, + "step": 5998 + }, + { + "epoch": 0.375, + "grad_norm": 2.34375, + "grad_norm_var": 0.013654581705729167, + "learning_rate": 0.0001, + "loss": 7.1552, + "loss/crossentropy": 2.4806891679763794, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2110806256532669, + "step": 6000 + }, + { + "epoch": 0.375125, + "grad_norm": 2.328125, + "grad_norm_var": 0.012451171875, + "learning_rate": 0.0001, + "loss": 7.2238, + "loss/crossentropy": 2.1955052614212036, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2050733044743538, + "step": 6002 + }, + { + "epoch": 0.37525, + "grad_norm": 2.0, + "grad_norm_var": 0.014990234375, + "learning_rate": 0.0001, + "loss": 7.2147, + "loss/crossentropy": 2.2241486310958862, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20495334267616272, + "step": 6004 + }, + { + "epoch": 0.375375, + "grad_norm": 2.203125, + "grad_norm_var": 0.015355428059895834, + "learning_rate": 0.0001, + "loss": 7.0965, + "loss/crossentropy": 2.167006492614746, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20993337780237198, + "step": 6006 + }, + { + "epoch": 0.3755, + "grad_norm": 2.25, + "grad_norm_var": 0.015620930989583334, + "learning_rate": 0.0001, + "loss": 7.1475, + "loss/crossentropy": 2.1680710911750793, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2089712917804718, + "step": 6008 + }, + { + "epoch": 0.375625, + "grad_norm": 2.28125, + "grad_norm_var": 0.015555826822916667, + "learning_rate": 0.0001, + "loss": 7.1355, + "loss/crossentropy": 2.338285446166992, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21749289333820343, + "step": 6010 + }, + { + "epoch": 0.37575, + "grad_norm": 2.1875, + "grad_norm_var": 0.0079498291015625, + "learning_rate": 0.0001, + "loss": 7.1924, + "loss/crossentropy": 2.2684131860733032, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2123074233531952, + "step": 6012 + }, + { + "epoch": 0.375875, + "grad_norm": 2.296875, + "grad_norm_var": 0.008429972330729167, + "learning_rate": 0.0001, + "loss": 7.2788, + "loss/crossentropy": 2.6743900775909424, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21920502185821533, + "step": 6014 + }, + { + "epoch": 0.376, + "grad_norm": 2.296875, + "grad_norm_var": 0.006598917643229166, + "learning_rate": 0.0001, + "loss": 7.3491, + "loss/crossentropy": 2.175094962120056, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2208651378750801, + "step": 6016 + }, + { + "epoch": 0.376125, + "grad_norm": 2.15625, + "grad_norm_var": 0.005757649739583333, + "learning_rate": 0.0001, + "loss": 7.2814, + "loss/crossentropy": 2.417030453681946, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2150922417640686, + "step": 6018 + }, + { + "epoch": 0.37625, + "grad_norm": 2.296875, + "grad_norm_var": 0.004069010416666667, + "learning_rate": 0.0001, + "loss": 7.1846, + "loss/crossentropy": 2.065057873725891, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.18963027000427246, + "step": 6020 + }, + { + "epoch": 0.376375, + "grad_norm": 2.1875, + "grad_norm_var": 0.0033843994140625, + "learning_rate": 0.0001, + "loss": 7.099, + "loss/crossentropy": 2.0630787014961243, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2001161426305771, + "step": 6022 + }, + { + "epoch": 0.3765, + "grad_norm": 2.09375, + "grad_norm_var": 0.005052693684895833, + "learning_rate": 0.0001, + "loss": 7.2794, + "loss/crossentropy": 2.018410086631775, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21171020716428757, + "step": 6024 + }, + { + "epoch": 0.376625, + "grad_norm": 2.140625, + "grad_norm_var": 0.007242838541666667, + "learning_rate": 0.0001, + "loss": 7.04, + "loss/crossentropy": 2.0343552231788635, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.1925143450498581, + "step": 6026 + }, + { + "epoch": 0.37675, + "grad_norm": 2.265625, + "grad_norm_var": 0.007731119791666667, + "learning_rate": 0.0001, + "loss": 7.2422, + "loss/crossentropy": 2.3210513591766357, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2177387848496437, + "step": 6028 + }, + { + "epoch": 0.376875, + "grad_norm": 2.203125, + "grad_norm_var": 0.007096354166666667, + "learning_rate": 0.0001, + "loss": 7.0303, + "loss/crossentropy": 2.0913103818893433, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19993747025728226, + "step": 6030 + }, + { + "epoch": 0.377, + "grad_norm": 2.15625, + "grad_norm_var": 0.00611572265625, + "learning_rate": 0.0001, + "loss": 7.2291, + "loss/crossentropy": 2.2609927654266357, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21157050877809525, + "step": 6032 + }, + { + "epoch": 0.377125, + "grad_norm": 2.296875, + "grad_norm_var": 0.007161458333333333, + "learning_rate": 0.0001, + "loss": 7.1153, + "loss/crossentropy": 2.5453518629074097, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22479583323001862, + "step": 6034 + }, + { + "epoch": 0.37725, + "grad_norm": 2.1875, + "grad_norm_var": 0.005952962239583333, + "learning_rate": 0.0001, + "loss": 7.2865, + "loss/crossentropy": 2.421576499938965, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22091317176818848, + "step": 6036 + }, + { + "epoch": 0.377375, + "grad_norm": 2.046875, + "grad_norm_var": 0.0066721598307291664, + "learning_rate": 0.0001, + "loss": 7.2464, + "loss/crossentropy": 2.370941162109375, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21858198940753937, + "step": 6038 + }, + { + "epoch": 0.3775, + "grad_norm": 2.171875, + "grad_norm_var": 0.006004842122395834, + "learning_rate": 0.0001, + "loss": 7.2514, + "loss/crossentropy": 2.3779879808425903, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21413833647966385, + "step": 6040 + }, + { + "epoch": 0.377625, + "grad_norm": 2.34375, + "grad_norm_var": 0.005671183268229167, + "learning_rate": 0.0001, + "loss": 7.3732, + "loss/crossentropy": 2.073517680168152, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19559810310602188, + "step": 6042 + }, + { + "epoch": 0.37775, + "grad_norm": 2.125, + "grad_norm_var": 0.005810546875, + "learning_rate": 0.0001, + "loss": 7.2228, + "loss/crossentropy": 2.136751651763916, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20897220820188522, + "step": 6044 + }, + { + "epoch": 0.377875, + "grad_norm": 2.1875, + "grad_norm_var": 0.005810546875, + "learning_rate": 0.0001, + "loss": 7.102, + "loss/crossentropy": 2.1889522671699524, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20655735582113266, + "step": 6046 + }, + { + "epoch": 0.378, + "grad_norm": 2.296875, + "grad_norm_var": 0.006891886393229167, + "learning_rate": 0.0001, + "loss": 7.3464, + "loss/crossentropy": 2.486303687095642, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2155042290687561, + "step": 6048 + }, + { + "epoch": 0.378125, + "grad_norm": 2.234375, + "grad_norm_var": 0.006441243489583333, + "learning_rate": 0.0001, + "loss": 7.2726, + "loss/crossentropy": 2.2876516580581665, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22767490148544312, + "step": 6050 + }, + { + "epoch": 0.37825, + "grad_norm": 2.359375, + "grad_norm_var": 0.007868448893229166, + "learning_rate": 0.0001, + "loss": 7.2255, + "loss/crossentropy": 2.4651968479156494, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2104952037334442, + "step": 6052 + }, + { + "epoch": 0.378375, + "grad_norm": 2.234375, + "grad_norm_var": 0.005467732747395833, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.672922730445862, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.24261310696601868, + "step": 6054 + }, + { + "epoch": 0.3785, + "grad_norm": 2.625, + "grad_norm_var": 0.0140625, + "learning_rate": 0.0001, + "loss": 7.1016, + "loss/crossentropy": 1.833503007888794, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.17841923236846924, + "step": 6056 + }, + { + "epoch": 0.378625, + "grad_norm": 1.96875, + "grad_norm_var": 0.01978759765625, + "learning_rate": 0.0001, + "loss": 7.203, + "loss/crossentropy": 2.2275290489196777, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20729243010282516, + "step": 6058 + }, + { + "epoch": 0.37875, + "grad_norm": 2.1875, + "grad_norm_var": 0.019050089518229167, + "learning_rate": 0.0001, + "loss": 7.2747, + "loss/crossentropy": 2.679691195487976, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21618877351284027, + "step": 6060 + }, + { + "epoch": 0.378875, + "grad_norm": 2.125, + "grad_norm_var": 0.0194488525390625, + "learning_rate": 0.0001, + "loss": 7.1795, + "loss/crossentropy": 2.263300061225891, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2042461857199669, + "step": 6062 + }, + { + "epoch": 0.379, + "grad_norm": 2.3125, + "grad_norm_var": 0.019416300455729167, + "learning_rate": 0.0001, + "loss": 7.398, + "loss/crossentropy": 2.6218732595443726, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2375633716583252, + "step": 6064 + }, + { + "epoch": 0.379125, + "grad_norm": 2.25, + "grad_norm_var": 0.019440714518229166, + "learning_rate": 0.0001, + "loss": 7.3822, + "loss/crossentropy": 2.30401611328125, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22317654639482498, + "step": 6066 + }, + { + "epoch": 0.37925, + "grad_norm": 2.421875, + "grad_norm_var": 0.023216756184895833, + "learning_rate": 0.0001, + "loss": 7.3373, + "loss/crossentropy": 1.9362152814865112, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1887020766735077, + "step": 6068 + }, + { + "epoch": 0.379375, + "grad_norm": 2.546875, + "grad_norm_var": 0.029313151041666666, + "learning_rate": 0.0001, + "loss": 7.5001, + "loss/crossentropy": 2.3842978477478027, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21501502394676208, + "step": 6070 + }, + { + "epoch": 0.3795, + "grad_norm": 2.34375, + "grad_norm_var": 0.023298136393229165, + "learning_rate": 0.0001, + "loss": 7.1358, + "loss/crossentropy": 2.22287917137146, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21563850343227386, + "step": 6072 + }, + { + "epoch": 0.379625, + "grad_norm": 2.265625, + "grad_norm_var": 0.017215983072916666, + "learning_rate": 0.0001, + "loss": 7.1006, + "loss/crossentropy": 2.402526021003723, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21390444040298462, + "step": 6074 + }, + { + "epoch": 0.37975, + "grad_norm": 1.9375, + "grad_norm_var": 0.0245513916015625, + "learning_rate": 0.0001, + "loss": 7.1442, + "loss/crossentropy": 2.188621401786804, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20676591992378235, + "step": 6076 + }, + { + "epoch": 0.379875, + "grad_norm": 2.34375, + "grad_norm_var": 0.022819010416666667, + "learning_rate": 0.0001, + "loss": 7.2357, + "loss/crossentropy": 2.382845878601074, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21810097247362137, + "step": 6078 + }, + { + "epoch": 0.38, + "grad_norm": 2.171875, + "grad_norm_var": 0.0234375, + "learning_rate": 0.0001, + "loss": 7.266, + "loss/crossentropy": 2.408275842666626, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21064983308315277, + "step": 6080 + }, + { + "epoch": 0.380125, + "grad_norm": 2.09375, + "grad_norm_var": 0.026611328125, + "learning_rate": 0.0001, + "loss": 7.1686, + "loss/crossentropy": 2.3628504276275635, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20509422570466995, + "step": 6082 + }, + { + "epoch": 0.38025, + "grad_norm": 2.15625, + "grad_norm_var": 0.022737630208333335, + "learning_rate": 0.0001, + "loss": 7.2245, + "loss/crossentropy": 2.246079921722412, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2075369581580162, + "step": 6084 + }, + { + "epoch": 0.380375, + "grad_norm": 2.03125, + "grad_norm_var": 0.016429646809895834, + "learning_rate": 0.0001, + "loss": 7.1207, + "loss/crossentropy": 2.012663960456848, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22296176850795746, + "step": 6086 + }, + { + "epoch": 0.3805, + "grad_norm": 2.109375, + "grad_norm_var": 0.0128082275390625, + "learning_rate": 0.0001, + "loss": 7.2217, + "loss/crossentropy": 2.5077545642852783, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2282715067267418, + "step": 6088 + }, + { + "epoch": 0.380625, + "grad_norm": 2.21875, + "grad_norm_var": 0.0127105712890625, + "learning_rate": 0.0001, + "loss": 7.1857, + "loss/crossentropy": 2.2846879959106445, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20731749385595322, + "step": 6090 + }, + { + "epoch": 0.38075, + "grad_norm": 2.21875, + "grad_norm_var": 0.0082916259765625, + "learning_rate": 0.0001, + "loss": 7.264, + "loss/crossentropy": 2.340460181236267, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21707424521446228, + "step": 6092 + }, + { + "epoch": 0.380875, + "grad_norm": 2.109375, + "grad_norm_var": 0.0072743733723958336, + "learning_rate": 0.0001, + "loss": 7.2139, + "loss/crossentropy": 2.317685127258301, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20413777977228165, + "step": 6094 + }, + { + "epoch": 0.381, + "grad_norm": 2.25, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 7.3123, + "loss/crossentropy": 2.2990000247955322, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22321767359972, + "step": 6096 + }, + { + "epoch": 0.381125, + "grad_norm": 2.296875, + "grad_norm_var": 0.006761678059895833, + "learning_rate": 0.0001, + "loss": 7.2415, + "loss/crossentropy": 2.307106852531433, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2068295031785965, + "step": 6098 + }, + { + "epoch": 0.38125, + "grad_norm": 2.09375, + "grad_norm_var": 0.009663899739583334, + "learning_rate": 0.0001, + "loss": 7.2459, + "loss/crossentropy": 2.272905468940735, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2071700543165207, + "step": 6100 + }, + { + "epoch": 0.381375, + "grad_norm": 2.140625, + "grad_norm_var": 0.007763671875, + "learning_rate": 0.0001, + "loss": 7.2205, + "loss/crossentropy": 2.2466858625411987, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21905604004859924, + "step": 6102 + }, + { + "epoch": 0.3815, + "grad_norm": 2.21875, + "grad_norm_var": 0.00625, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.113400459289551, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20151030272245407, + "step": 6104 + }, + { + "epoch": 0.381625, + "grad_norm": 2.25, + "grad_norm_var": 0.00826416015625, + "learning_rate": 0.0001, + "loss": 7.2066, + "loss/crossentropy": 2.289364218711853, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2247104048728943, + "step": 6106 + }, + { + "epoch": 0.38175, + "grad_norm": 2.234375, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 7.2602, + "loss/crossentropy": 2.203786611557007, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21391122043132782, + "step": 6108 + }, + { + "epoch": 0.381875, + "grad_norm": 2.125, + "grad_norm_var": 0.011454264322916666, + "learning_rate": 0.0001, + "loss": 7.1077, + "loss/crossentropy": 2.2855257987976074, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.24084167182445526, + "step": 6110 + }, + { + "epoch": 0.382, + "grad_norm": 2.09375, + "grad_norm_var": 0.0131500244140625, + "learning_rate": 0.0001, + "loss": 7.1511, + "loss/crossentropy": 2.326382040977478, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22301767766475677, + "step": 6112 + }, + { + "epoch": 0.382125, + "grad_norm": 2.1875, + "grad_norm_var": 0.01357421875, + "learning_rate": 0.0001, + "loss": 7.3663, + "loss/crossentropy": 2.4644904136657715, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2120847851037979, + "step": 6114 + }, + { + "epoch": 0.38225, + "grad_norm": 2.046875, + "grad_norm_var": 0.017594401041666666, + "learning_rate": 0.0001, + "loss": 7.1899, + "loss/crossentropy": 1.9932494163513184, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.1996365636587143, + "step": 6116 + }, + { + "epoch": 0.382375, + "grad_norm": 2.171875, + "grad_norm_var": 0.017769368489583333, + "learning_rate": 0.0001, + "loss": 7.2331, + "loss/crossentropy": 2.2997413873672485, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21524758636951447, + "step": 6118 + }, + { + "epoch": 0.3825, + "grad_norm": 2.234375, + "grad_norm_var": 0.018277994791666665, + "learning_rate": 0.0001, + "loss": 7.1646, + "loss/crossentropy": 2.412147045135498, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20587334036827087, + "step": 6120 + }, + { + "epoch": 0.382625, + "grad_norm": 2.34375, + "grad_norm_var": 0.020100911458333332, + "learning_rate": 0.0001, + "loss": 7.1928, + "loss/crossentropy": 2.1714669466018677, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.22052258253097534, + "step": 6122 + }, + { + "epoch": 0.38275, + "grad_norm": 2.234375, + "grad_norm_var": 0.017887369791666666, + "learning_rate": 0.0001, + "loss": 7.2932, + "loss/crossentropy": 2.3583080768585205, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.232616625726223, + "step": 6124 + }, + { + "epoch": 0.382875, + "grad_norm": 2.234375, + "grad_norm_var": 0.016999308268229166, + "learning_rate": 0.0001, + "loss": 7.252, + "loss/crossentropy": 2.2754658460617065, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21504772454500198, + "step": 6126 + }, + { + "epoch": 0.383, + "grad_norm": 2.359375, + "grad_norm_var": 0.023509724934895834, + "learning_rate": 0.0001, + "loss": 7.1404, + "loss/crossentropy": 2.382510781288147, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22090370953083038, + "step": 6128 + }, + { + "epoch": 0.383125, + "grad_norm": 1.9921875, + "grad_norm_var": 0.027787017822265624, + "learning_rate": 0.0001, + "loss": 7.1187, + "loss/crossentropy": 2.0568565130233765, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19300862401723862, + "step": 6130 + }, + { + "epoch": 0.38325, + "grad_norm": 2.140625, + "grad_norm_var": 0.021183013916015625, + "learning_rate": 0.0001, + "loss": 7.2013, + "loss/crossentropy": 2.395463228225708, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21835564076900482, + "step": 6132 + }, + { + "epoch": 0.383375, + "grad_norm": 2.09375, + "grad_norm_var": 0.020715077718098957, + "learning_rate": 0.0001, + "loss": 7.2018, + "loss/crossentropy": 2.2619467973709106, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2159418836236, + "step": 6134 + }, + { + "epoch": 0.3835, + "grad_norm": 2.328125, + "grad_norm_var": 0.020918528238932293, + "learning_rate": 0.0001, + "loss": 7.4816, + "loss/crossentropy": 2.2685283422470093, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21295687556266785, + "step": 6136 + }, + { + "epoch": 0.383625, + "grad_norm": 2.25, + "grad_norm_var": 0.019311269124348957, + "learning_rate": 0.0001, + "loss": 7.1095, + "loss/crossentropy": 2.2552493810653687, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2095998227596283, + "step": 6138 + }, + { + "epoch": 0.38375, + "grad_norm": 2.203125, + "grad_norm_var": 0.020139312744140624, + "learning_rate": 0.0001, + "loss": 7.2686, + "loss/crossentropy": 2.3464255332946777, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20441381633281708, + "step": 6140 + }, + { + "epoch": 0.383875, + "grad_norm": 2.078125, + "grad_norm_var": 0.024421183268229167, + "learning_rate": 0.0001, + "loss": 7.2288, + "loss/crossentropy": 2.287395715713501, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22399558871984482, + "step": 6142 + }, + { + "epoch": 0.384, + "grad_norm": 2.203125, + "grad_norm_var": 0.011922200520833334, + "learning_rate": 0.0001, + "loss": 7.4251, + "loss/crossentropy": 2.164630949497223, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22558195143938065, + "step": 6144 + }, + { + "epoch": 0.384125, + "grad_norm": 2.203125, + "grad_norm_var": 0.010959625244140625, + "learning_rate": 0.0001, + "loss": 7.1384, + "loss/crossentropy": 1.937812328338623, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.19157906621694565, + "step": 6146 + }, + { + "epoch": 0.38425, + "grad_norm": 2.359375, + "grad_norm_var": 0.013787587483723959, + "learning_rate": 0.0001, + "loss": 7.2158, + "loss/crossentropy": 2.3608381748199463, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.24235235154628754, + "step": 6148 + }, + { + "epoch": 0.384375, + "grad_norm": 2.265625, + "grad_norm_var": 0.014587148030598959, + "learning_rate": 0.0001, + "loss": 7.1659, + "loss/crossentropy": 2.4223140478134155, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21669473499059677, + "step": 6150 + }, + { + "epoch": 0.3845, + "grad_norm": 2.265625, + "grad_norm_var": 0.017937978108723957, + "learning_rate": 0.0001, + "loss": 7.0478, + "loss/crossentropy": 2.2232367992401123, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21520529687404633, + "step": 6152 + }, + { + "epoch": 0.384625, + "grad_norm": 1.984375, + "grad_norm_var": 0.021602121988932292, + "learning_rate": 0.0001, + "loss": 7.2573, + "loss/crossentropy": 2.498006224632263, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22033587098121643, + "step": 6154 + }, + { + "epoch": 0.38475, + "grad_norm": 2.40625, + "grad_norm_var": 0.02343928019205729, + "learning_rate": 0.0001, + "loss": 7.2942, + "loss/crossentropy": 2.367275357246399, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21903083473443985, + "step": 6156 + }, + { + "epoch": 0.384875, + "grad_norm": 2.234375, + "grad_norm_var": 0.0179351806640625, + "learning_rate": 0.0001, + "loss": 7.1205, + "loss/crossentropy": 2.4437015056610107, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21279380470514297, + "step": 6158 + }, + { + "epoch": 0.385, + "grad_norm": 2.0625, + "grad_norm_var": 0.021907552083333334, + "learning_rate": 0.0001, + "loss": 7.247, + "loss/crossentropy": 2.4123772382736206, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22339122742414474, + "step": 6160 + }, + { + "epoch": 0.385125, + "grad_norm": 2.375, + "grad_norm_var": 0.023811848958333333, + "learning_rate": 0.0001, + "loss": 7.186, + "loss/crossentropy": 2.2593997716903687, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2121337652206421, + "step": 6162 + }, + { + "epoch": 0.38525, + "grad_norm": 2.03125, + "grad_norm_var": 0.024283854166666667, + "learning_rate": 0.0001, + "loss": 7.2505, + "loss/crossentropy": 2.4139418601989746, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2320389673113823, + "step": 6164 + }, + { + "epoch": 0.385375, + "grad_norm": 2.234375, + "grad_norm_var": 0.023021443684895834, + "learning_rate": 0.0001, + "loss": 7.1636, + "loss/crossentropy": 2.300944983959198, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21375388652086258, + "step": 6166 + }, + { + "epoch": 0.3855, + "grad_norm": 2.015625, + "grad_norm_var": 0.0188140869140625, + "learning_rate": 0.0001, + "loss": 7.1753, + "loss/crossentropy": 2.328981041908264, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2163231372833252, + "step": 6168 + }, + { + "epoch": 0.385625, + "grad_norm": 2.28125, + "grad_norm_var": 0.016825358072916668, + "learning_rate": 0.0001, + "loss": 7.257, + "loss/crossentropy": 2.253252863883972, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1982560232281685, + "step": 6170 + }, + { + "epoch": 0.38575, + "grad_norm": 2.140625, + "grad_norm_var": 0.01328125, + "learning_rate": 0.0001, + "loss": 7.1984, + "loss/crossentropy": 2.4192394018173218, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20848099142313004, + "step": 6172 + }, + { + "epoch": 0.385875, + "grad_norm": 2.140625, + "grad_norm_var": 0.013410441080729167, + "learning_rate": 0.0001, + "loss": 7.1191, + "loss/crossentropy": 2.1820013523101807, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2143269181251526, + "step": 6174 + }, + { + "epoch": 0.386, + "grad_norm": 2.296875, + "grad_norm_var": 0.013410441080729167, + "learning_rate": 0.0001, + "loss": 7.0709, + "loss/crossentropy": 2.271676182746887, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.208109050989151, + "step": 6176 + }, + { + "epoch": 0.386125, + "grad_norm": 2.09375, + "grad_norm_var": 0.008935546875, + "learning_rate": 0.0001, + "loss": 7.0798, + "loss/crossentropy": 2.139783024787903, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2209583893418312, + "step": 6178 + }, + { + "epoch": 0.38625, + "grad_norm": 2.359375, + "grad_norm_var": 0.01142578125, + "learning_rate": 0.0001, + "loss": 7.1879, + "loss/crossentropy": 2.22269070148468, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20203150063753128, + "step": 6180 + }, + { + "epoch": 0.386375, + "grad_norm": 2.171875, + "grad_norm_var": 0.015510813395182291, + "learning_rate": 0.0001, + "loss": 7.1266, + "loss/crossentropy": 2.225795269012451, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.20338183641433716, + "step": 6182 + }, + { + "epoch": 0.3865, + "grad_norm": 2.125, + "grad_norm_var": 0.013512929280598959, + "learning_rate": 0.0001, + "loss": 7.1648, + "loss/crossentropy": 2.114717125892639, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22314286977052689, + "step": 6184 + }, + { + "epoch": 0.386625, + "grad_norm": 2.09375, + "grad_norm_var": 0.013219960530598958, + "learning_rate": 0.0001, + "loss": 7.1101, + "loss/crossentropy": 2.0499364137649536, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.20304933190345764, + "step": 6186 + }, + { + "epoch": 0.38675, + "grad_norm": 2.125, + "grad_norm_var": 0.013002268473307292, + "learning_rate": 0.0001, + "loss": 7.1289, + "loss/crossentropy": 2.2222620248794556, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2161303237080574, + "step": 6188 + }, + { + "epoch": 0.386875, + "grad_norm": 2.1875, + "grad_norm_var": 0.012294260660807292, + "learning_rate": 0.0001, + "loss": 7.1987, + "loss/crossentropy": 2.296729564666748, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2208261713385582, + "step": 6190 + }, + { + "epoch": 0.387, + "grad_norm": 2.3125, + "grad_norm_var": 0.016477203369140624, + "learning_rate": 0.0001, + "loss": 7.4317, + "loss/crossentropy": 2.4254910945892334, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2514353543519974, + "step": 6192 + }, + { + "epoch": 0.387125, + "grad_norm": 2.109375, + "grad_norm_var": 0.01654230753580729, + "learning_rate": 0.0001, + "loss": 7.1742, + "loss/crossentropy": 2.1946414709091187, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20449410378932953, + "step": 6194 + }, + { + "epoch": 0.38725, + "grad_norm": 2.109375, + "grad_norm_var": 0.012839508056640626, + "learning_rate": 0.0001, + "loss": 7.1523, + "loss/crossentropy": 2.1898844242095947, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2156129777431488, + "step": 6196 + }, + { + "epoch": 0.387375, + "grad_norm": 2.328125, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 7.158, + "loss/crossentropy": 2.100374698638916, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21593213081359863, + "step": 6198 + }, + { + "epoch": 0.3875, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014564768473307291, + "learning_rate": 0.0001, + "loss": 7.0778, + "loss/crossentropy": 2.119781017303467, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19760458916425705, + "step": 6200 + }, + { + "epoch": 0.387625, + "grad_norm": 2.265625, + "grad_norm_var": 0.014088694254557292, + "learning_rate": 0.0001, + "loss": 7.1764, + "loss/crossentropy": 2.4633294343948364, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20755568891763687, + "step": 6202 + }, + { + "epoch": 0.38775, + "grad_norm": 2.15625, + "grad_norm_var": 0.014168039957682291, + "learning_rate": 0.0001, + "loss": 7.2959, + "loss/crossentropy": 2.351404905319214, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21977319568395615, + "step": 6204 + }, + { + "epoch": 0.387875, + "grad_norm": 2.171875, + "grad_norm_var": 0.014178212483723958, + "learning_rate": 0.0001, + "loss": 7.2006, + "loss/crossentropy": 2.1882619857788086, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2126511111855507, + "step": 6206 + }, + { + "epoch": 0.388, + "grad_norm": 2.015625, + "grad_norm_var": 0.009747060139973958, + "learning_rate": 0.0001, + "loss": 7.1983, + "loss/crossentropy": 2.2526434659957886, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2065277025103569, + "step": 6208 + }, + { + "epoch": 0.388125, + "grad_norm": 2.09375, + "grad_norm_var": 0.009854888916015625, + "learning_rate": 0.0001, + "loss": 7.1293, + "loss/crossentropy": 2.3055331707000732, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2006327137351036, + "step": 6210 + }, + { + "epoch": 0.38825, + "grad_norm": 2.234375, + "grad_norm_var": 0.010575103759765624, + "learning_rate": 0.0001, + "loss": 7.2887, + "loss/crossentropy": 2.175672173500061, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21848966926336288, + "step": 6212 + }, + { + "epoch": 0.388375, + "grad_norm": 2.40625, + "grad_norm_var": 0.013836415608723958, + "learning_rate": 0.0001, + "loss": 7.2577, + "loss/crossentropy": 2.3525902032852173, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22647082805633545, + "step": 6214 + }, + { + "epoch": 0.3885, + "grad_norm": 2.1875, + "grad_norm_var": 0.020490519205729165, + "learning_rate": 0.0001, + "loss": 7.279, + "loss/crossentropy": 2.0665590167045593, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19694138318300247, + "step": 6216 + }, + { + "epoch": 0.388625, + "grad_norm": 2.125, + "grad_norm_var": 0.022809855143229165, + "learning_rate": 0.0001, + "loss": 7.233, + "loss/crossentropy": 2.0745668411254883, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20363005250692368, + "step": 6218 + }, + { + "epoch": 0.38875, + "grad_norm": 2.53125, + "grad_norm_var": 0.028922526041666667, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 2.038256347179413, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21672368794679642, + "step": 6220 + }, + { + "epoch": 0.388875, + "grad_norm": 3.0, + "grad_norm_var": 0.06468098958333333, + "learning_rate": 0.0001, + "loss": 7.2287, + "loss/crossentropy": 2.2980682849884033, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22767220437526703, + "step": 6222 + }, + { + "epoch": 0.389, + "grad_norm": 2.03125, + "grad_norm_var": 0.07187398274739583, + "learning_rate": 0.0001, + "loss": 7.1803, + "loss/crossentropy": 2.175195574760437, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20187747478485107, + "step": 6224 + }, + { + "epoch": 0.389125, + "grad_norm": 2.1875, + "grad_norm_var": 0.07619400024414062, + "learning_rate": 0.0001, + "loss": 7.0902, + "loss/crossentropy": 2.0487667322158813, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19584185630083084, + "step": 6226 + }, + { + "epoch": 0.38925, + "grad_norm": 2.3125, + "grad_norm_var": 0.0768450419108073, + "learning_rate": 0.0001, + "loss": 7.3122, + "loss/crossentropy": 2.1507842540740967, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2099941298365593, + "step": 6228 + }, + { + "epoch": 0.389375, + "grad_norm": 2.265625, + "grad_norm_var": 0.07747573852539062, + "learning_rate": 0.0001, + "loss": 7.3023, + "loss/crossentropy": 2.536380410194397, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23754464834928513, + "step": 6230 + }, + { + "epoch": 0.3895, + "grad_norm": 2.09375, + "grad_norm_var": 0.07687149047851563, + "learning_rate": 0.0001, + "loss": 7.1825, + "loss/crossentropy": 2.354156494140625, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22577545046806335, + "step": 6232 + }, + { + "epoch": 0.389625, + "grad_norm": 2.234375, + "grad_norm_var": 0.07278416951497396, + "learning_rate": 0.0001, + "loss": 7.1849, + "loss/crossentropy": 2.0844647884368896, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2019355520606041, + "step": 6234 + }, + { + "epoch": 0.38975, + "grad_norm": 2.203125, + "grad_norm_var": 0.06987279256184896, + "learning_rate": 0.0001, + "loss": 7.1985, + "loss/crossentropy": 2.2951064109802246, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20773043483495712, + "step": 6236 + }, + { + "epoch": 0.389875, + "grad_norm": 2.25, + "grad_norm_var": 0.031109364827473958, + "learning_rate": 0.0001, + "loss": 7.227, + "loss/crossentropy": 2.1024820804595947, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2022949606180191, + "step": 6238 + }, + { + "epoch": 0.39, + "grad_norm": 2.09375, + "grad_norm_var": 0.014829254150390625, + "learning_rate": 0.0001, + "loss": 7.149, + "loss/crossentropy": 2.028247654438019, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19261081516742706, + "step": 6240 + }, + { + "epoch": 0.390125, + "grad_norm": 2.953125, + "grad_norm_var": 0.04890848795572917, + "learning_rate": 0.0001, + "loss": 7.3008, + "loss/crossentropy": 2.4559611082077026, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21937407553195953, + "step": 6242 + }, + { + "epoch": 0.39025, + "grad_norm": 2.046875, + "grad_norm_var": 0.050191243489583336, + "learning_rate": 0.0001, + "loss": 7.1446, + "loss/crossentropy": 2.094746768474579, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.18669873476028442, + "step": 6244 + }, + { + "epoch": 0.390375, + "grad_norm": 2.328125, + "grad_norm_var": 0.05080973307291667, + "learning_rate": 0.0001, + "loss": 7.3141, + "loss/crossentropy": 2.0901660323143005, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2054583877325058, + "step": 6246 + }, + { + "epoch": 0.3905, + "grad_norm": 2.265625, + "grad_norm_var": 0.04784749348958333, + "learning_rate": 0.0001, + "loss": 7.323, + "loss/crossentropy": 2.0414637327194214, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2149074748158455, + "step": 6248 + }, + { + "epoch": 0.390625, + "grad_norm": 2.3125, + "grad_norm_var": 0.04676106770833333, + "learning_rate": 0.0001, + "loss": 7.2818, + "loss/crossentropy": 2.477187752723694, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22341058403253555, + "step": 6250 + }, + { + "epoch": 0.39075, + "grad_norm": 2.171875, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 7.0282, + "loss/crossentropy": 2.167757034301758, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19077523797750473, + "step": 6252 + }, + { + "epoch": 0.390875, + "grad_norm": 1.984375, + "grad_norm_var": 0.05100809733072917, + "learning_rate": 0.0001, + "loss": 7.2128, + "loss/crossentropy": 2.4444233179092407, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21900728344917297, + "step": 6254 + }, + { + "epoch": 0.391, + "grad_norm": 2.15625, + "grad_norm_var": 0.0484771728515625, + "learning_rate": 0.0001, + "loss": 7.1528, + "loss/crossentropy": 2.353291869163513, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21415339410305023, + "step": 6256 + }, + { + "epoch": 0.391125, + "grad_norm": 2.171875, + "grad_norm_var": 0.015192667643229166, + "learning_rate": 0.0001, + "loss": 7.091, + "loss/crossentropy": 2.2490928173065186, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2167230099439621, + "step": 6258 + }, + { + "epoch": 0.39125, + "grad_norm": 2.21875, + "grad_norm_var": 0.013704427083333333, + "learning_rate": 0.0001, + "loss": 7.3306, + "loss/crossentropy": 2.4165178537368774, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2405092567205429, + "step": 6260 + }, + { + "epoch": 0.391375, + "grad_norm": 5.4375, + "grad_norm_var": 0.8871378580729167, + "learning_rate": 0.0001, + "loss": 7.3884, + "loss/crossentropy": 2.242551565170288, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23066271841526031, + "step": 6262 + }, + { + "epoch": 0.3915, + "grad_norm": 13.0625, + "grad_norm_var": 7.71259765625, + "learning_rate": 0.0001, + "loss": 7.8466, + "loss/crossentropy": 2.4470694065093994, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.26077815890312195, + "step": 6264 + }, + { + "epoch": 0.391625, + "grad_norm": 2.625, + "grad_norm_var": 7.606929524739583, + "learning_rate": 0.0001, + "loss": 7.3717, + "loss/crossentropy": 2.3733749389648438, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23013299703598022, + "step": 6266 + }, + { + "epoch": 0.39175, + "grad_norm": 2.46875, + "grad_norm_var": 7.546891276041666, + "learning_rate": 0.0001, + "loss": 7.2616, + "loss/crossentropy": 1.922485888004303, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2019965946674347, + "step": 6268 + }, + { + "epoch": 0.391875, + "grad_norm": 2.015625, + "grad_norm_var": 7.520503743489583, + "learning_rate": 0.0001, + "loss": 7.1054, + "loss/crossentropy": 2.258482575416565, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22108732908964157, + "step": 6270 + }, + { + "epoch": 0.392, + "grad_norm": 2.515625, + "grad_norm_var": 7.477079264322916, + "learning_rate": 0.0001, + "loss": 7.0443, + "loss/crossentropy": 2.2347971200942993, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21819213032722473, + "step": 6272 + }, + { + "epoch": 0.392125, + "grad_norm": 2.15625, + "grad_norm_var": 7.528499348958333, + "learning_rate": 0.0001, + "loss": 7.2214, + "loss/crossentropy": 2.2647920846939087, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21231649070978165, + "step": 6274 + }, + { + "epoch": 0.39225, + "grad_norm": 2.390625, + "grad_norm_var": 7.526005045572917, + "learning_rate": 0.0001, + "loss": 7.4141, + "loss/crossentropy": 2.3626712560653687, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21143031865358353, + "step": 6276 + }, + { + "epoch": 0.392375, + "grad_norm": 2.515625, + "grad_norm_var": 7.242333984375, + "learning_rate": 0.0001, + "loss": 7.2001, + "loss/crossentropy": 2.018476188182831, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20615462213754654, + "step": 6278 + }, + { + "epoch": 0.3925, + "grad_norm": 2.09375, + "grad_norm_var": 0.07892252604166666, + "learning_rate": 0.0001, + "loss": 7.1264, + "loss/crossentropy": 2.347910761833191, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2185712829232216, + "step": 6280 + }, + { + "epoch": 0.392625, + "grad_norm": 2.1875, + "grad_norm_var": 0.027339680989583334, + "learning_rate": 0.0001, + "loss": 7.4132, + "loss/crossentropy": 2.291501462459564, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21928339451551437, + "step": 6282 + }, + { + "epoch": 0.39275, + "grad_norm": 2.234375, + "grad_norm_var": 0.0239166259765625, + "learning_rate": 0.0001, + "loss": 7.3247, + "loss/crossentropy": 2.402106761932373, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2256387323141098, + "step": 6284 + }, + { + "epoch": 0.392875, + "grad_norm": 2.234375, + "grad_norm_var": 0.020929972330729168, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 2.5922293663024902, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23517914861440659, + "step": 6286 + }, + { + "epoch": 0.393, + "grad_norm": 2.0625, + "grad_norm_var": 0.018831380208333335, + "learning_rate": 0.0001, + "loss": 6.9919, + "loss/crossentropy": 1.85099458694458, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.19639435410499573, + "step": 6288 + }, + { + "epoch": 0.393125, + "grad_norm": 2.109375, + "grad_norm_var": 0.019466145833333334, + "learning_rate": 0.0001, + "loss": 7.1741, + "loss/crossentropy": 2.6446659564971924, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20629265159368515, + "step": 6290 + }, + { + "epoch": 0.39325, + "grad_norm": 2.1875, + "grad_norm_var": 0.0166656494140625, + "learning_rate": 0.0001, + "loss": 7.0529, + "loss/crossentropy": 2.020451545715332, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2004483938217163, + "step": 6292 + }, + { + "epoch": 0.393375, + "grad_norm": 2.296875, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 7.1559, + "loss/crossentropy": 2.37979257106781, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.23029006272554398, + "step": 6294 + }, + { + "epoch": 0.3935, + "grad_norm": 2.265625, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 7.1597, + "loss/crossentropy": 1.9197645783424377, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.18409430235624313, + "step": 6296 + }, + { + "epoch": 0.393625, + "grad_norm": 2.203125, + "grad_norm_var": 0.0141510009765625, + "learning_rate": 0.0001, + "loss": 7.2572, + "loss/crossentropy": 2.4349591732025146, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2220984399318695, + "step": 6298 + }, + { + "epoch": 0.39375, + "grad_norm": 2.140625, + "grad_norm_var": 0.018131256103515625, + "learning_rate": 0.0001, + "loss": 7.2722, + "loss/crossentropy": 2.2742892503738403, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2200971096754074, + "step": 6300 + }, + { + "epoch": 0.393875, + "grad_norm": 2.25, + "grad_norm_var": 0.01762669881184896, + "learning_rate": 0.0001, + "loss": 7.1513, + "loss/crossentropy": 2.224164128303528, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21906382590532303, + "step": 6302 + }, + { + "epoch": 0.394, + "grad_norm": 2.203125, + "grad_norm_var": 0.01935399373372396, + "learning_rate": 0.0001, + "loss": 7.4193, + "loss/crossentropy": 2.4383881092071533, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21790315955877304, + "step": 6304 + }, + { + "epoch": 0.394125, + "grad_norm": 2.0625, + "grad_norm_var": 0.020763905843098958, + "learning_rate": 0.0001, + "loss": 7.258, + "loss/crossentropy": 2.2842483520507812, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21311689168214798, + "step": 6306 + }, + { + "epoch": 0.39425, + "grad_norm": 2.171875, + "grad_norm_var": 0.022474924723307293, + "learning_rate": 0.0001, + "loss": 7.2049, + "loss/crossentropy": 2.2808109521865845, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2112395316362381, + "step": 6308 + }, + { + "epoch": 0.394375, + "grad_norm": 2.34375, + "grad_norm_var": 0.023158518473307292, + "learning_rate": 0.0001, + "loss": 7.0865, + "loss/crossentropy": 2.1974657773971558, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20438392460346222, + "step": 6310 + }, + { + "epoch": 0.3945, + "grad_norm": 2.125, + "grad_norm_var": 0.034126536051432295, + "learning_rate": 0.0001, + "loss": 7.1925, + "loss/crossentropy": 2.0605512261390686, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20864906907081604, + "step": 6312 + }, + { + "epoch": 0.394625, + "grad_norm": 2.171875, + "grad_norm_var": 0.029797108968098958, + "learning_rate": 0.0001, + "loss": 7.0857, + "loss/crossentropy": 2.132881999015808, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19599150866270065, + "step": 6314 + }, + { + "epoch": 0.39475, + "grad_norm": 2.09375, + "grad_norm_var": 0.025439453125, + "learning_rate": 0.0001, + "loss": 7.2386, + "loss/crossentropy": 2.266274333000183, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.234784334897995, + "step": 6316 + }, + { + "epoch": 0.394875, + "grad_norm": 2.15625, + "grad_norm_var": 0.025397745768229167, + "learning_rate": 0.0001, + "loss": 7.3488, + "loss/crossentropy": 2.5363104343414307, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2150138020515442, + "step": 6318 + }, + { + "epoch": 0.395, + "grad_norm": 2.140625, + "grad_norm_var": 0.027229817708333333, + "learning_rate": 0.0001, + "loss": 7.2938, + "loss/crossentropy": 2.313738703727722, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21550405770540237, + "step": 6320 + }, + { + "epoch": 0.395125, + "grad_norm": 2.03125, + "grad_norm_var": 0.02779541015625, + "learning_rate": 0.0001, + "loss": 7.1882, + "loss/crossentropy": 2.285131096839905, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20362475514411926, + "step": 6322 + }, + { + "epoch": 0.39525, + "grad_norm": 2.203125, + "grad_norm_var": 0.026790364583333334, + "learning_rate": 0.0001, + "loss": 7.2794, + "loss/crossentropy": 2.522578239440918, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2191583439707756, + "step": 6324 + }, + { + "epoch": 0.395375, + "grad_norm": 2.203125, + "grad_norm_var": 0.025755818684895834, + "learning_rate": 0.0001, + "loss": 7.1791, + "loss/crossentropy": 2.460786819458008, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2294606864452362, + "step": 6326 + }, + { + "epoch": 0.3955, + "grad_norm": 2.140625, + "grad_norm_var": 0.01246337890625, + "learning_rate": 0.0001, + "loss": 7.1824, + "loss/crossentropy": 2.282975435256958, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.22038684040308, + "step": 6328 + }, + { + "epoch": 0.395625, + "grad_norm": 2.21875, + "grad_norm_var": 0.012165323893229166, + "learning_rate": 0.0001, + "loss": 7.129, + "loss/crossentropy": 2.178337335586548, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2119307518005371, + "step": 6330 + }, + { + "epoch": 0.39575, + "grad_norm": 2.328125, + "grad_norm_var": 0.013981119791666666, + "learning_rate": 0.0001, + "loss": 7.311, + "loss/crossentropy": 2.2752076387405396, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20718948543071747, + "step": 6332 + }, + { + "epoch": 0.395875, + "grad_norm": 2.625, + "grad_norm_var": 0.025113932291666665, + "learning_rate": 0.0001, + "loss": 7.2538, + "loss/crossentropy": 2.2171541452407837, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2145935222506523, + "step": 6334 + }, + { + "epoch": 0.396, + "grad_norm": 2.3125, + "grad_norm_var": 0.03355204264322917, + "learning_rate": 0.0001, + "loss": 7.3479, + "loss/crossentropy": 2.221325159072876, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22985991835594177, + "step": 6336 + }, + { + "epoch": 0.396125, + "grad_norm": 2.203125, + "grad_norm_var": 0.028425089518229165, + "learning_rate": 0.0001, + "loss": 7.2308, + "loss/crossentropy": 2.456564784049988, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21483594179153442, + "step": 6338 + }, + { + "epoch": 0.39625, + "grad_norm": 2.15625, + "grad_norm_var": 0.0310455322265625, + "learning_rate": 0.0001, + "loss": 7.1604, + "loss/crossentropy": 2.1799052953720093, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2172977179288864, + "step": 6340 + }, + { + "epoch": 0.396375, + "grad_norm": 2.15625, + "grad_norm_var": 0.0314453125, + "learning_rate": 0.0001, + "loss": 7.3171, + "loss/crossentropy": 2.308402419090271, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21601353585720062, + "step": 6342 + }, + { + "epoch": 0.3965, + "grad_norm": 2.3125, + "grad_norm_var": 0.03222249348958333, + "learning_rate": 0.0001, + "loss": 7.2262, + "loss/crossentropy": 2.40183162689209, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2093559354543686, + "step": 6344 + }, + { + "epoch": 0.396625, + "grad_norm": 1.984375, + "grad_norm_var": 0.034651692708333334, + "learning_rate": 0.0001, + "loss": 7.0315, + "loss/crossentropy": 1.9722952842712402, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19374338537454605, + "step": 6346 + }, + { + "epoch": 0.39675, + "grad_norm": 2.234375, + "grad_norm_var": 0.03699544270833333, + "learning_rate": 0.0001, + "loss": 7.1221, + "loss/crossentropy": 2.2486064434051514, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19635196030139923, + "step": 6348 + }, + { + "epoch": 0.396875, + "grad_norm": 2.15625, + "grad_norm_var": 0.025829060872395834, + "learning_rate": 0.0001, + "loss": 7.1707, + "loss/crossentropy": 2.4905707836151123, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.23304647207260132, + "step": 6350 + }, + { + "epoch": 0.397, + "grad_norm": 2.09375, + "grad_norm_var": 0.0089996337890625, + "learning_rate": 0.0001, + "loss": 7.1753, + "loss/crossentropy": 2.437414765357971, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2328067496418953, + "step": 6352 + }, + { + "epoch": 0.397125, + "grad_norm": 2.03125, + "grad_norm_var": 0.011942545572916666, + "learning_rate": 0.0001, + "loss": 7.0693, + "loss/crossentropy": 2.294760227203369, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21324169635772705, + "step": 6354 + }, + { + "epoch": 0.39725, + "grad_norm": 2.34375, + "grad_norm_var": 0.01558837890625, + "learning_rate": 0.0001, + "loss": 7.1353, + "loss/crossentropy": 2.355687439441681, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2057061642408371, + "step": 6356 + }, + { + "epoch": 0.397375, + "grad_norm": 1.96875, + "grad_norm_var": 0.0186920166015625, + "learning_rate": 0.0001, + "loss": 7.1041, + "loss/crossentropy": 2.3166561126708984, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19575699418783188, + "step": 6358 + }, + { + "epoch": 0.3975, + "grad_norm": 2.21875, + "grad_norm_var": 0.0171539306640625, + "learning_rate": 0.0001, + "loss": 6.97, + "loss/crossentropy": 2.255427122116089, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19496920704841614, + "step": 6360 + }, + { + "epoch": 0.397625, + "grad_norm": 2.09375, + "grad_norm_var": 0.016926066080729166, + "learning_rate": 0.0001, + "loss": 7.276, + "loss/crossentropy": 2.5599944591522217, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.24714961647987366, + "step": 6362 + }, + { + "epoch": 0.39775, + "grad_norm": 2.234375, + "grad_norm_var": 0.01539306640625, + "learning_rate": 0.0001, + "loss": 7.1811, + "loss/crossentropy": 2.349897623062134, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21329647302627563, + "step": 6364 + }, + { + "epoch": 0.397875, + "grad_norm": 2.046875, + "grad_norm_var": 0.01549072265625, + "learning_rate": 0.0001, + "loss": 7.1841, + "loss/crossentropy": 2.190832495689392, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.219038225710392, + "step": 6366 + }, + { + "epoch": 0.398, + "grad_norm": 2.265625, + "grad_norm_var": 0.0158599853515625, + "learning_rate": 0.0001, + "loss": 7.2998, + "loss/crossentropy": 2.4349499940872192, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21405059099197388, + "step": 6368 + }, + { + "epoch": 0.398125, + "grad_norm": 2.234375, + "grad_norm_var": 0.01129150390625, + "learning_rate": 0.0001, + "loss": 7.2085, + "loss/crossentropy": 2.380038619041443, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21820517629384995, + "step": 6370 + }, + { + "epoch": 0.39825, + "grad_norm": 2.171875, + "grad_norm_var": 0.009186808268229167, + "learning_rate": 0.0001, + "loss": 7.1465, + "loss/crossentropy": 2.311740756034851, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20036467909812927, + "step": 6372 + }, + { + "epoch": 0.398375, + "grad_norm": 2.203125, + "grad_norm_var": 0.005387369791666667, + "learning_rate": 0.0001, + "loss": 7.0727, + "loss/crossentropy": 2.3335254192352295, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21295969933271408, + "step": 6374 + }, + { + "epoch": 0.3985, + "grad_norm": 2.109375, + "grad_norm_var": 0.00562744140625, + "learning_rate": 0.0001, + "loss": 7.2106, + "loss/crossentropy": 2.2149945497512817, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20361988991498947, + "step": 6376 + }, + { + "epoch": 0.398625, + "grad_norm": 2.140625, + "grad_norm_var": 0.005659993489583333, + "learning_rate": 0.0001, + "loss": 7.062, + "loss/crossentropy": 2.217359721660614, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.206251360476017, + "step": 6378 + }, + { + "epoch": 0.39875, + "grad_norm": 2.15625, + "grad_norm_var": 0.006461588541666666, + "learning_rate": 0.0001, + "loss": 7.0996, + "loss/crossentropy": 2.3303332328796387, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22268137335777283, + "step": 6380 + }, + { + "epoch": 0.398875, + "grad_norm": 2.125, + "grad_norm_var": 0.0055572509765625, + "learning_rate": 0.0001, + "loss": 7.263, + "loss/crossentropy": 2.365337371826172, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21782244741916656, + "step": 6382 + }, + { + "epoch": 0.399, + "grad_norm": 2.171875, + "grad_norm_var": 0.010190582275390625, + "learning_rate": 0.0001, + "loss": 7.1806, + "loss/crossentropy": 2.1229239106178284, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2005736082792282, + "step": 6384 + }, + { + "epoch": 0.399125, + "grad_norm": 2.203125, + "grad_norm_var": 0.009224192301432291, + "learning_rate": 0.0001, + "loss": 7.1557, + "loss/crossentropy": 2.10502552986145, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21637042611837387, + "step": 6386 + }, + { + "epoch": 0.39925, + "grad_norm": 2.296875, + "grad_norm_var": 0.010229237874348958, + "learning_rate": 0.0001, + "loss": 7.3129, + "loss/crossentropy": 2.272659182548523, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20518405735492706, + "step": 6388 + }, + { + "epoch": 0.399375, + "grad_norm": 2.109375, + "grad_norm_var": 0.011130523681640626, + "learning_rate": 0.0001, + "loss": 7.2501, + "loss/crossentropy": 2.0195122957229614, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.19787423312664032, + "step": 6390 + }, + { + "epoch": 0.3995, + "grad_norm": 2.015625, + "grad_norm_var": 0.012631988525390625, + "learning_rate": 0.0001, + "loss": 6.965, + "loss/crossentropy": 2.0998696088790894, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.18977724760770798, + "step": 6392 + }, + { + "epoch": 0.399625, + "grad_norm": 2.078125, + "grad_norm_var": 0.011926015218098959, + "learning_rate": 0.0001, + "loss": 7.1537, + "loss/crossentropy": 2.5123926401138306, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22189470380544662, + "step": 6394 + }, + { + "epoch": 0.39975, + "grad_norm": 2.546875, + "grad_norm_var": 0.02241999308268229, + "learning_rate": 0.0001, + "loss": 7.145, + "loss/crossentropy": 1.997516393661499, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19417890161275864, + "step": 6396 + }, + { + "epoch": 0.399875, + "grad_norm": 2.0625, + "grad_norm_var": 0.023083241780598958, + "learning_rate": 0.0001, + "loss": 7.0326, + "loss/crossentropy": 2.183876097202301, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2068372145295143, + "step": 6398 + }, + { + "epoch": 0.4, + "grad_norm": 2.125, + "grad_norm_var": 0.021825154622395832, + "learning_rate": 0.0001, + "loss": 7.0986, + "loss/crossentropy": 2.2808289527893066, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21026340126991272, + "step": 6400 + }, + { + "epoch": 0.400125, + "grad_norm": 2.15625, + "grad_norm_var": 0.021761067708333335, + "learning_rate": 0.0001, + "loss": 7.2804, + "loss/crossentropy": 2.11923885345459, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2049631029367447, + "step": 6402 + }, + { + "epoch": 0.40025, + "grad_norm": 2.125, + "grad_norm_var": 0.020601399739583335, + "learning_rate": 0.0001, + "loss": 7.1831, + "loss/crossentropy": 2.2619943022727966, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.22322535514831543, + "step": 6404 + }, + { + "epoch": 0.400375, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02127049763997396, + "learning_rate": 0.0001, + "loss": 7.0207, + "loss/crossentropy": 1.81930810213089, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.18869058787822723, + "step": 6406 + }, + { + "epoch": 0.4005, + "grad_norm": 2.296875, + "grad_norm_var": 0.02247899373372396, + "learning_rate": 0.0001, + "loss": 7.3292, + "loss/crossentropy": 2.421749472618103, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21702227741479874, + "step": 6408 + }, + { + "epoch": 0.400625, + "grad_norm": 1.9921875, + "grad_norm_var": 0.024112955729166666, + "learning_rate": 0.0001, + "loss": 7.2174, + "loss/crossentropy": 2.363996148109436, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20497220754623413, + "step": 6410 + }, + { + "epoch": 0.40075, + "grad_norm": 2.1875, + "grad_norm_var": 0.011481730143229167, + "learning_rate": 0.0001, + "loss": 7.2747, + "loss/crossentropy": 2.1107038259506226, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2324059084057808, + "step": 6412 + }, + { + "epoch": 0.400875, + "grad_norm": 2.015625, + "grad_norm_var": 0.011673990885416667, + "learning_rate": 0.0001, + "loss": 7.1866, + "loss/crossentropy": 2.1150999069213867, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2122773751616478, + "step": 6414 + }, + { + "epoch": 0.401, + "grad_norm": 2.40625, + "grad_norm_var": 0.013849894205729166, + "learning_rate": 0.0001, + "loss": 7.2246, + "loss/crossentropy": 2.143269181251526, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2062653973698616, + "step": 6416 + }, + { + "epoch": 0.401125, + "grad_norm": 2.15625, + "grad_norm_var": 0.018065388997395834, + "learning_rate": 0.0001, + "loss": 7.317, + "loss/crossentropy": 2.1853350400924683, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.1954660639166832, + "step": 6418 + }, + { + "epoch": 0.40125, + "grad_norm": 2.109375, + "grad_norm_var": 0.018114217122395835, + "learning_rate": 0.0001, + "loss": 7.1075, + "loss/crossentropy": 2.1452205181121826, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2259805053472519, + "step": 6420 + }, + { + "epoch": 0.401375, + "grad_norm": 2.15625, + "grad_norm_var": 0.015215810139973958, + "learning_rate": 0.0001, + "loss": 7.1589, + "loss/crossentropy": 2.5433114767074585, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20600835233926773, + "step": 6422 + }, + { + "epoch": 0.4015, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016356404622395834, + "learning_rate": 0.0001, + "loss": 7.0635, + "loss/crossentropy": 1.9214220643043518, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.18052654713392258, + "step": 6424 + }, + { + "epoch": 0.401625, + "grad_norm": 2.3125, + "grad_norm_var": 0.015728505452473958, + "learning_rate": 0.0001, + "loss": 7.2356, + "loss/crossentropy": 2.359765887260437, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22128498554229736, + "step": 6426 + }, + { + "epoch": 0.40175, + "grad_norm": 2.0625, + "grad_norm_var": 0.017470041910807293, + "learning_rate": 0.0001, + "loss": 7.2596, + "loss/crossentropy": 2.2433066368103027, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2121361345052719, + "step": 6428 + }, + { + "epoch": 0.401875, + "grad_norm": 2.5, + "grad_norm_var": 0.022226715087890626, + "learning_rate": 0.0001, + "loss": 7.2592, + "loss/crossentropy": 2.251939296722412, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21200115978717804, + "step": 6430 + }, + { + "epoch": 0.402, + "grad_norm": 2.1875, + "grad_norm_var": 0.019909413655598958, + "learning_rate": 0.0001, + "loss": 7.1853, + "loss/crossentropy": 2.20802104473114, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2019861936569214, + "step": 6432 + }, + { + "epoch": 0.402125, + "grad_norm": 2.25, + "grad_norm_var": 0.01645075480143229, + "learning_rate": 0.0001, + "loss": 7.1294, + "loss/crossentropy": 2.509099006652832, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22067614644765854, + "step": 6434 + }, + { + "epoch": 0.40225, + "grad_norm": 2.515625, + "grad_norm_var": 0.025785319010416665, + "learning_rate": 0.0001, + "loss": 7.0552, + "loss/crossentropy": 2.2592573165893555, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21897969394922256, + "step": 6436 + }, + { + "epoch": 0.402375, + "grad_norm": 2.296875, + "grad_norm_var": 0.027347819010416666, + "learning_rate": 0.0001, + "loss": 7.3063, + "loss/crossentropy": 2.221360445022583, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21196063607931137, + "step": 6438 + }, + { + "epoch": 0.4025, + "grad_norm": 2.03125, + "grad_norm_var": 0.025294748942057292, + "learning_rate": 0.0001, + "loss": 7.1702, + "loss/crossentropy": 2.428785562515259, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22930145263671875, + "step": 6440 + }, + { + "epoch": 0.402625, + "grad_norm": 2.09375, + "grad_norm_var": 0.02661921183268229, + "learning_rate": 0.0001, + "loss": 7.2067, + "loss/crossentropy": 2.287087917327881, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.18991201370954514, + "step": 6442 + }, + { + "epoch": 0.40275, + "grad_norm": 2.09375, + "grad_norm_var": 0.02487767537434896, + "learning_rate": 0.0001, + "loss": 7.0066, + "loss/crossentropy": 2.274364471435547, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21028787642717361, + "step": 6444 + }, + { + "epoch": 0.402875, + "grad_norm": 2.40625, + "grad_norm_var": 0.02175877888997396, + "learning_rate": 0.0001, + "loss": 7.1773, + "loss/crossentropy": 2.046273946762085, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19751162081956863, + "step": 6446 + }, + { + "epoch": 0.403, + "grad_norm": 2.140625, + "grad_norm_var": 0.023178863525390624, + "learning_rate": 0.0001, + "loss": 7.2427, + "loss/crossentropy": 2.1227652430534363, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19309522956609726, + "step": 6448 + }, + { + "epoch": 0.403125, + "grad_norm": 2.203125, + "grad_norm_var": 0.02542088826497396, + "learning_rate": 0.0001, + "loss": 7.0169, + "loss/crossentropy": 2.1013576984405518, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2055366113781929, + "step": 6450 + }, + { + "epoch": 0.40325, + "grad_norm": 2.234375, + "grad_norm_var": 0.016097005208333334, + "learning_rate": 0.0001, + "loss": 7.2335, + "loss/crossentropy": 2.3643540143966675, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21434146910905838, + "step": 6452 + }, + { + "epoch": 0.403375, + "grad_norm": 2.171875, + "grad_norm_var": 0.013102213541666666, + "learning_rate": 0.0001, + "loss": 7.1063, + "loss/crossentropy": 2.2940242290496826, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21527153253555298, + "step": 6454 + }, + { + "epoch": 0.4035, + "grad_norm": 2.015625, + "grad_norm_var": 0.01363525390625, + "learning_rate": 0.0001, + "loss": 7.1233, + "loss/crossentropy": 2.432578206062317, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21458515524864197, + "step": 6456 + }, + { + "epoch": 0.403625, + "grad_norm": 2.515625, + "grad_norm_var": 0.0198150634765625, + "learning_rate": 0.0001, + "loss": 7.161, + "loss/crossentropy": 2.0361666083335876, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21390791982412338, + "step": 6458 + }, + { + "epoch": 0.40375, + "grad_norm": 2.28125, + "grad_norm_var": 0.0194000244140625, + "learning_rate": 0.0001, + "loss": 7.1093, + "loss/crossentropy": 2.171969771385193, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.22019729018211365, + "step": 6460 + }, + { + "epoch": 0.403875, + "grad_norm": 2.15625, + "grad_norm_var": 0.0191314697265625, + "learning_rate": 0.0001, + "loss": 7.1831, + "loss/crossentropy": 2.086853861808777, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21678819507360458, + "step": 6462 + }, + { + "epoch": 0.404, + "grad_norm": 2.078125, + "grad_norm_var": 0.017015584309895835, + "learning_rate": 0.0001, + "loss": 7.0841, + "loss/crossentropy": 2.077217400074005, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.1952114701271057, + "step": 6464 + }, + { + "epoch": 0.404125, + "grad_norm": 2.390625, + "grad_norm_var": 0.018431599934895834, + "learning_rate": 0.0001, + "loss": 7.3492, + "loss/crossentropy": 2.162355422973633, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20112480223178864, + "step": 6466 + }, + { + "epoch": 0.40425, + "grad_norm": 2.296875, + "grad_norm_var": 0.018895467122395832, + "learning_rate": 0.0001, + "loss": 7.2183, + "loss/crossentropy": 2.3603663444519043, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22803721576929092, + "step": 6468 + }, + { + "epoch": 0.404375, + "grad_norm": 2.265625, + "grad_norm_var": 0.0194976806640625, + "learning_rate": 0.0001, + "loss": 7.4027, + "loss/crossentropy": 2.0049321055412292, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22556506842374802, + "step": 6470 + }, + { + "epoch": 0.4045, + "grad_norm": 2.1875, + "grad_norm_var": 0.01796875, + "learning_rate": 0.0001, + "loss": 7.0518, + "loss/crossentropy": 2.002595067024231, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.19405024498701096, + "step": 6472 + }, + { + "epoch": 0.404625, + "grad_norm": 2.0625, + "grad_norm_var": 0.015262858072916666, + "learning_rate": 0.0001, + "loss": 7.2985, + "loss/crossentropy": 2.281522750854492, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21547602117061615, + "step": 6474 + }, + { + "epoch": 0.40475, + "grad_norm": 2.3125, + "grad_norm_var": 0.0160308837890625, + "learning_rate": 0.0001, + "loss": 7.0747, + "loss/crossentropy": 2.1680938005447388, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21121270209550858, + "step": 6476 + }, + { + "epoch": 0.404875, + "grad_norm": 2.109375, + "grad_norm_var": 0.016796875, + "learning_rate": 0.0001, + "loss": 7.1666, + "loss/crossentropy": 2.5115526914596558, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2365971878170967, + "step": 6478 + }, + { + "epoch": 0.405, + "grad_norm": 3.171875, + "grad_norm_var": 0.06926676432291666, + "learning_rate": 0.0001, + "loss": 7.2812, + "loss/crossentropy": 2.2533043026924133, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.224958136677742, + "step": 6480 + }, + { + "epoch": 0.405125, + "grad_norm": 1.84375, + "grad_norm_var": 0.07986653645833333, + "learning_rate": 0.0001, + "loss": 7.1225, + "loss/crossentropy": 2.2455263137817383, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2447589933872223, + "step": 6482 + }, + { + "epoch": 0.40525, + "grad_norm": 2.25, + "grad_norm_var": 0.0816070556640625, + "learning_rate": 0.0001, + "loss": 7.3032, + "loss/crossentropy": 1.951097846031189, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20029456168413162, + "step": 6484 + }, + { + "epoch": 0.405375, + "grad_norm": 2.109375, + "grad_norm_var": 0.08391520182291666, + "learning_rate": 0.0001, + "loss": 7.2667, + "loss/crossentropy": 2.215672254562378, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2087327167391777, + "step": 6486 + }, + { + "epoch": 0.4055, + "grad_norm": 2.21875, + "grad_norm_var": 0.08147379557291666, + "learning_rate": 0.0001, + "loss": 7.1273, + "loss/crossentropy": 1.8793167471885681, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2043544054031372, + "step": 6488 + }, + { + "epoch": 0.405625, + "grad_norm": 2.203125, + "grad_norm_var": 0.07893473307291667, + "learning_rate": 0.0001, + "loss": 7.2892, + "loss/crossentropy": 2.3511608839035034, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.20901018381118774, + "step": 6490 + }, + { + "epoch": 0.40575, + "grad_norm": 2.09375, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 0.0001, + "loss": 7.1031, + "loss/crossentropy": 2.2664103507995605, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2170920968055725, + "step": 6492 + }, + { + "epoch": 0.405875, + "grad_norm": 1.953125, + "grad_norm_var": 0.08240559895833334, + "learning_rate": 0.0001, + "loss": 7.2317, + "loss/crossentropy": 2.4039368629455566, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2005300372838974, + "step": 6494 + }, + { + "epoch": 0.406, + "grad_norm": 2.328125, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 7.0248, + "loss/crossentropy": 1.9945274591445923, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19976937770843506, + "step": 6496 + }, + { + "epoch": 0.406125, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014977773030598959, + "learning_rate": 0.0001, + "loss": 7.0207, + "loss/crossentropy": 2.2633402347564697, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20823977887630463, + "step": 6498 + }, + { + "epoch": 0.40625, + "grad_norm": 2.265625, + "grad_norm_var": 0.01840184529622396, + "learning_rate": 0.0001, + "loss": 7.3655, + "loss/crossentropy": 2.1993072628974915, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2053944393992424, + "step": 6500 + }, + { + "epoch": 0.406375, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022272745768229168, + "learning_rate": 0.0001, + "loss": 7.0907, + "loss/crossentropy": 2.3876596689224243, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20196551084518433, + "step": 6502 + }, + { + "epoch": 0.4065, + "grad_norm": 2.328125, + "grad_norm_var": 0.0234619140625, + "learning_rate": 0.0001, + "loss": 7.2996, + "loss/crossentropy": 2.2332963347434998, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.22670364379882812, + "step": 6504 + }, + { + "epoch": 0.406625, + "grad_norm": 2.078125, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 7.2239, + "loss/crossentropy": 2.4286776781082153, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21485131978988647, + "step": 6506 + }, + { + "epoch": 0.40675, + "grad_norm": 2.171875, + "grad_norm_var": 0.023444620768229167, + "learning_rate": 0.0001, + "loss": 7.132, + "loss/crossentropy": 2.1059845685958862, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20297736674547195, + "step": 6508 + }, + { + "epoch": 0.406875, + "grad_norm": 2.078125, + "grad_norm_var": 0.021336873372395832, + "learning_rate": 0.0001, + "loss": 7.3092, + "loss/crossentropy": 2.437384843826294, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21660450100898743, + "step": 6510 + }, + { + "epoch": 0.407, + "grad_norm": 2.390625, + "grad_norm_var": 0.0242431640625, + "learning_rate": 0.0001, + "loss": 7.3282, + "loss/crossentropy": 2.492189645767212, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.23497627675533295, + "step": 6512 + }, + { + "epoch": 0.407125, + "grad_norm": 2.09375, + "grad_norm_var": 0.021437327067057293, + "learning_rate": 0.0001, + "loss": 7.29, + "loss/crossentropy": 2.0942156314849854, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.21186655014753342, + "step": 6514 + }, + { + "epoch": 0.40725, + "grad_norm": 2.296875, + "grad_norm_var": 0.01834284464518229, + "learning_rate": 0.0001, + "loss": 7.2034, + "loss/crossentropy": 2.205212712287903, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23347996175289154, + "step": 6516 + }, + { + "epoch": 0.407375, + "grad_norm": 2.1875, + "grad_norm_var": 0.016682942708333332, + "learning_rate": 0.0001, + "loss": 7.2561, + "loss/crossentropy": 2.18733811378479, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19546174257993698, + "step": 6518 + }, + { + "epoch": 0.4075, + "grad_norm": 2.0625, + "grad_norm_var": 0.016185506184895834, + "learning_rate": 0.0001, + "loss": 7.0733, + "loss/crossentropy": 2.0966725945472717, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22100364416837692, + "step": 6520 + }, + { + "epoch": 0.407625, + "grad_norm": 2.140625, + "grad_norm_var": 0.014680989583333333, + "learning_rate": 0.0001, + "loss": 7.07, + "loss/crossentropy": 1.8778254985809326, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.2160200998187065, + "step": 6522 + }, + { + "epoch": 0.40775, + "grad_norm": 2.359375, + "grad_norm_var": 0.015071614583333334, + "learning_rate": 0.0001, + "loss": 7.1556, + "loss/crossentropy": 2.265491247177124, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.23102690279483795, + "step": 6524 + }, + { + "epoch": 0.407875, + "grad_norm": 2.296875, + "grad_norm_var": 0.018745930989583333, + "learning_rate": 0.0001, + "loss": 7.2781, + "loss/crossentropy": 2.2143566012382507, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2069038674235344, + "step": 6526 + }, + { + "epoch": 0.408, + "grad_norm": 2.21875, + "grad_norm_var": 0.014383951822916666, + "learning_rate": 0.0001, + "loss": 7.297, + "loss/crossentropy": 2.3243274688720703, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21643533557653427, + "step": 6528 + }, + { + "epoch": 0.408125, + "grad_norm": 2.3125, + "grad_norm_var": 0.014191691080729167, + "learning_rate": 0.0001, + "loss": 7.2748, + "loss/crossentropy": 2.3320904970169067, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2175091654062271, + "step": 6530 + }, + { + "epoch": 0.40825, + "grad_norm": 2.171875, + "grad_norm_var": 0.016087849934895832, + "learning_rate": 0.0001, + "loss": 7.2427, + "loss/crossentropy": 2.1425468921661377, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20631052553653717, + "step": 6532 + }, + { + "epoch": 0.408375, + "grad_norm": 2.515625, + "grad_norm_var": 0.0207183837890625, + "learning_rate": 0.0001, + "loss": 7.1293, + "loss/crossentropy": 2.483289361000061, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23001858592033386, + "step": 6534 + }, + { + "epoch": 0.4085, + "grad_norm": 2.09375, + "grad_norm_var": 0.0211822509765625, + "learning_rate": 0.0001, + "loss": 7.2254, + "loss/crossentropy": 2.507395625114441, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.20727375894784927, + "step": 6536 + }, + { + "epoch": 0.408625, + "grad_norm": 1.9921875, + "grad_norm_var": 0.030450185139973957, + "learning_rate": 0.0001, + "loss": 7.0344, + "loss/crossentropy": 2.13398277759552, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.18832530081272125, + "step": 6538 + }, + { + "epoch": 0.40875, + "grad_norm": 2.359375, + "grad_norm_var": 0.030755360921223957, + "learning_rate": 0.0001, + "loss": 7.1028, + "loss/crossentropy": 2.128291964530945, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2085724174976349, + "step": 6540 + }, + { + "epoch": 0.408875, + "grad_norm": 2.265625, + "grad_norm_var": 0.026468658447265626, + "learning_rate": 0.0001, + "loss": 7.3629, + "loss/crossentropy": 2.387086510658264, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.23265989124774933, + "step": 6542 + }, + { + "epoch": 0.409, + "grad_norm": 2.171875, + "grad_norm_var": 0.02826512654622396, + "learning_rate": 0.0001, + "loss": 7.2463, + "loss/crossentropy": 2.436495780944824, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.216141015291214, + "step": 6544 + }, + { + "epoch": 0.409125, + "grad_norm": 2.546875, + "grad_norm_var": 0.03478368123372396, + "learning_rate": 0.0001, + "loss": 7.322, + "loss/crossentropy": 2.2948137521743774, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2177954763174057, + "step": 6546 + }, + { + "epoch": 0.40925, + "grad_norm": 2.359375, + "grad_norm_var": 0.032195790608723955, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.068236470222473, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19945338368415833, + "step": 6548 + }, + { + "epoch": 0.409375, + "grad_norm": 2.234375, + "grad_norm_var": 0.026920318603515625, + "learning_rate": 0.0001, + "loss": 7.202, + "loss/crossentropy": 2.0515838861465454, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.1940801441669464, + "step": 6550 + }, + { + "epoch": 0.4095, + "grad_norm": 2.078125, + "grad_norm_var": 0.025221506754557293, + "learning_rate": 0.0001, + "loss": 7.0551, + "loss/crossentropy": 2.2386069297790527, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21453995257616043, + "step": 6552 + }, + { + "epoch": 0.409625, + "grad_norm": 2.140625, + "grad_norm_var": 0.018553670247395834, + "learning_rate": 0.0001, + "loss": 7.1711, + "loss/crossentropy": 2.238744616508484, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.208785742521286, + "step": 6554 + }, + { + "epoch": 0.40975, + "grad_norm": 2.078125, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 7.2699, + "loss/crossentropy": 2.372469902038574, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2197486311197281, + "step": 6556 + }, + { + "epoch": 0.409875, + "grad_norm": 2.296875, + "grad_norm_var": 0.018724568684895835, + "learning_rate": 0.0001, + "loss": 7.3083, + "loss/crossentropy": 2.257944345474243, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20727266371250153, + "step": 6558 + }, + { + "epoch": 0.41, + "grad_norm": 2.140625, + "grad_norm_var": 0.016162109375, + "learning_rate": 0.0001, + "loss": 7.2547, + "loss/crossentropy": 2.3854445219039917, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2205938622355461, + "step": 6560 + }, + { + "epoch": 0.410125, + "grad_norm": 2.171875, + "grad_norm_var": 0.0113922119140625, + "learning_rate": 0.0001, + "loss": 6.9946, + "loss/crossentropy": 2.099511981010437, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2024962157011032, + "step": 6562 + }, + { + "epoch": 0.41025, + "grad_norm": 2.015625, + "grad_norm_var": 0.0093414306640625, + "learning_rate": 0.0001, + "loss": 7.1571, + "loss/crossentropy": 2.1464951634407043, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21468877792358398, + "step": 6564 + }, + { + "epoch": 0.410375, + "grad_norm": 2.171875, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 7.2538, + "loss/crossentropy": 2.3937063217163086, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2055506482720375, + "step": 6566 + }, + { + "epoch": 0.4105, + "grad_norm": 2.09375, + "grad_norm_var": 0.010400390625, + "learning_rate": 0.0001, + "loss": 7.3326, + "loss/crossentropy": 2.384568929672241, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22990135848522186, + "step": 6568 + }, + { + "epoch": 0.410625, + "grad_norm": 2.328125, + "grad_norm_var": 0.012821451822916666, + "learning_rate": 0.0001, + "loss": 7.1655, + "loss/crossentropy": 2.2787879705429077, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.23064248263835907, + "step": 6570 + }, + { + "epoch": 0.41075, + "grad_norm": 2.109375, + "grad_norm_var": 0.012679036458333333, + "learning_rate": 0.0001, + "loss": 7.1028, + "loss/crossentropy": 2.1466062664985657, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20553258806467056, + "step": 6572 + }, + { + "epoch": 0.410875, + "grad_norm": 2.3125, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 7.3206, + "loss/crossentropy": 2.2528934478759766, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20717480778694153, + "step": 6574 + }, + { + "epoch": 0.411, + "grad_norm": 2.875, + "grad_norm_var": 0.04734598795572917, + "learning_rate": 0.0001, + "loss": 7.0194, + "loss/crossentropy": 1.7589277029037476, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.18202517926692963, + "step": 6576 + }, + { + "epoch": 0.411125, + "grad_norm": 1.9765625, + "grad_norm_var": 0.04872817993164062, + "learning_rate": 0.0001, + "loss": 7.148, + "loss/crossentropy": 1.9737151265144348, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.1905621960759163, + "step": 6578 + }, + { + "epoch": 0.41125, + "grad_norm": 2.546875, + "grad_norm_var": 0.055216217041015626, + "learning_rate": 0.0001, + "loss": 7.1603, + "loss/crossentropy": 2.265736222267151, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21055534482002258, + "step": 6580 + }, + { + "epoch": 0.411375, + "grad_norm": 2.046875, + "grad_norm_var": 0.054593658447265624, + "learning_rate": 0.0001, + "loss": 7.1553, + "loss/crossentropy": 2.3410770893096924, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22156669199466705, + "step": 6582 + }, + { + "epoch": 0.4115, + "grad_norm": 2.0625, + "grad_norm_var": 0.056955718994140626, + "learning_rate": 0.0001, + "loss": 7.2238, + "loss/crossentropy": 2.381894826889038, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21481861174106598, + "step": 6584 + }, + { + "epoch": 0.411625, + "grad_norm": 2.140625, + "grad_norm_var": 0.05864639282226562, + "learning_rate": 0.0001, + "loss": 7.2543, + "loss/crossentropy": 2.0804589986801147, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2047130987048149, + "step": 6586 + }, + { + "epoch": 0.41175, + "grad_norm": 2.15625, + "grad_norm_var": 0.055275217692057295, + "learning_rate": 0.0001, + "loss": 7.1031, + "loss/crossentropy": 2.2733839750289917, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21279729157686234, + "step": 6588 + }, + { + "epoch": 0.411875, + "grad_norm": 2.15625, + "grad_norm_var": 0.05738703409830729, + "learning_rate": 0.0001, + "loss": 7.2073, + "loss/crossentropy": 2.3205610513687134, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21465665102005005, + "step": 6590 + }, + { + "epoch": 0.412, + "grad_norm": 2.0625, + "grad_norm_var": 0.02643407185872396, + "learning_rate": 0.0001, + "loss": 7.1259, + "loss/crossentropy": 2.2790287733078003, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20951499789953232, + "step": 6592 + }, + { + "epoch": 0.412125, + "grad_norm": 2.484375, + "grad_norm_var": 0.028148396809895834, + "learning_rate": 0.0001, + "loss": 7.3196, + "loss/crossentropy": 2.3589893579483032, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21895043551921844, + "step": 6594 + }, + { + "epoch": 0.41225, + "grad_norm": 2.203125, + "grad_norm_var": 0.015413411458333333, + "learning_rate": 0.0001, + "loss": 6.9755, + "loss/crossentropy": 2.2003557682037354, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.1951805129647255, + "step": 6596 + }, + { + "epoch": 0.412375, + "grad_norm": 2.21875, + "grad_norm_var": 0.012206013997395833, + "learning_rate": 0.0001, + "loss": 7.2822, + "loss/crossentropy": 2.1846193075180054, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2042449563741684, + "step": 6598 + }, + { + "epoch": 0.4125, + "grad_norm": 2.1875, + "grad_norm_var": 0.011449178059895834, + "learning_rate": 0.0001, + "loss": 7.1855, + "loss/crossentropy": 2.1782132387161255, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2254723384976387, + "step": 6600 + }, + { + "epoch": 0.412625, + "grad_norm": 2.265625, + "grad_norm_var": 0.0118316650390625, + "learning_rate": 0.0001, + "loss": 7.2243, + "loss/crossentropy": 2.1209170818328857, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.22717957943677902, + "step": 6602 + }, + { + "epoch": 0.41275, + "grad_norm": 2.265625, + "grad_norm_var": 0.01217041015625, + "learning_rate": 0.0001, + "loss": 7.2421, + "loss/crossentropy": 2.050145149230957, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19752968847751617, + "step": 6604 + }, + { + "epoch": 0.412875, + "grad_norm": 2.234375, + "grad_norm_var": 0.012593587239583334, + "learning_rate": 0.0001, + "loss": 7.2494, + "loss/crossentropy": 2.1468405723571777, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2092445194721222, + "step": 6606 + }, + { + "epoch": 0.413, + "grad_norm": 2.0625, + "grad_norm_var": 0.012059529622395834, + "learning_rate": 0.0001, + "loss": 7.1083, + "loss/crossentropy": 2.13679301738739, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20577143877744675, + "step": 6608 + }, + { + "epoch": 0.413125, + "grad_norm": 2.375, + "grad_norm_var": 0.008610026041666666, + "learning_rate": 0.0001, + "loss": 7.442, + "loss/crossentropy": 2.2817904949188232, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.20960254222154617, + "step": 6610 + }, + { + "epoch": 0.41325, + "grad_norm": 2.421875, + "grad_norm_var": 0.00849609375, + "learning_rate": 0.0001, + "loss": 7.2355, + "loss/crossentropy": 2.309928774833679, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2041112631559372, + "step": 6612 + }, + { + "epoch": 0.413375, + "grad_norm": 2.359375, + "grad_norm_var": 0.021214803059895832, + "learning_rate": 0.0001, + "loss": 7.2045, + "loss/crossentropy": 1.9638023972511292, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19546246528625488, + "step": 6614 + }, + { + "epoch": 0.4135, + "grad_norm": 2.125, + "grad_norm_var": 0.022069295247395832, + "learning_rate": 0.0001, + "loss": 7.2197, + "loss/crossentropy": 2.2863436937332153, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20454485714435577, + "step": 6616 + }, + { + "epoch": 0.413625, + "grad_norm": 2.15625, + "grad_norm_var": 0.025191243489583334, + "learning_rate": 0.0001, + "loss": 7.0682, + "loss/crossentropy": 2.2289178371429443, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2051631659269333, + "step": 6618 + }, + { + "epoch": 0.41375, + "grad_norm": 2.109375, + "grad_norm_var": 0.026325480143229166, + "learning_rate": 0.0001, + "loss": 7.246, + "loss/crossentropy": 2.1252601742744446, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21219032257795334, + "step": 6620 + }, + { + "epoch": 0.413875, + "grad_norm": 2.109375, + "grad_norm_var": 0.027228800455729167, + "learning_rate": 0.0001, + "loss": 7.1937, + "loss/crossentropy": 2.377307415008545, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22052890807390213, + "step": 6622 + }, + { + "epoch": 0.414, + "grad_norm": 2.109375, + "grad_norm_var": 0.028043619791666665, + "learning_rate": 0.0001, + "loss": 7.0238, + "loss/crossentropy": 2.070056438446045, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20407379418611526, + "step": 6624 + }, + { + "epoch": 0.414125, + "grad_norm": 2.09375, + "grad_norm_var": 0.027567545572916668, + "learning_rate": 0.0001, + "loss": 7.1428, + "loss/crossentropy": 2.3578120470046997, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2143327072262764, + "step": 6626 + }, + { + "epoch": 0.41425, + "grad_norm": 2.25, + "grad_norm_var": 0.026170857747395835, + "learning_rate": 0.0001, + "loss": 7.1836, + "loss/crossentropy": 2.4478999376296997, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21191944181919098, + "step": 6628 + }, + { + "epoch": 0.414375, + "grad_norm": 2.203125, + "grad_norm_var": 0.0091705322265625, + "learning_rate": 0.0001, + "loss": 7.2445, + "loss/crossentropy": 2.12015438079834, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.20925237238407135, + "step": 6630 + }, + { + "epoch": 0.4145, + "grad_norm": 2.265625, + "grad_norm_var": 0.009566243489583333, + "learning_rate": 0.0001, + "loss": 6.9971, + "loss/crossentropy": 2.209274411201477, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21350758522748947, + "step": 6632 + }, + { + "epoch": 0.414625, + "grad_norm": 2.234375, + "grad_norm_var": 0.009077962239583333, + "learning_rate": 0.0001, + "loss": 7.2467, + "loss/crossentropy": 2.0114998817443848, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2036866992712021, + "step": 6634 + }, + { + "epoch": 0.41475, + "grad_norm": 2.21875, + "grad_norm_var": 0.00732421875, + "learning_rate": 0.0001, + "loss": 7.3385, + "loss/crossentropy": 2.2817453145980835, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21267592161893845, + "step": 6636 + }, + { + "epoch": 0.414875, + "grad_norm": 2.0, + "grad_norm_var": 0.009300740559895833, + "learning_rate": 0.0001, + "loss": 7.3145, + "loss/crossentropy": 2.3444933891296387, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21684523671865463, + "step": 6638 + }, + { + "epoch": 0.415, + "grad_norm": 2.28125, + "grad_norm_var": 0.011150868733723958, + "learning_rate": 0.0001, + "loss": 7.2032, + "loss/crossentropy": 2.2932777404785156, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.24464474618434906, + "step": 6640 + }, + { + "epoch": 0.415125, + "grad_norm": 2.03125, + "grad_norm_var": 0.012143707275390625, + "learning_rate": 0.0001, + "loss": 7.1644, + "loss/crossentropy": 2.237114191055298, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2008899301290512, + "step": 6642 + }, + { + "epoch": 0.41525, + "grad_norm": 2.15625, + "grad_norm_var": 0.010117340087890624, + "learning_rate": 0.0001, + "loss": 7.2987, + "loss/crossentropy": 2.182630181312561, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1930861920118332, + "step": 6644 + }, + { + "epoch": 0.415375, + "grad_norm": 2.40625, + "grad_norm_var": 0.050872548421223955, + "learning_rate": 0.0001, + "loss": 7.2461, + "loss/crossentropy": 2.0044440031051636, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.18386952579021454, + "step": 6646 + }, + { + "epoch": 0.4155, + "grad_norm": 2.078125, + "grad_norm_var": 0.05223770141601562, + "learning_rate": 0.0001, + "loss": 7.1746, + "loss/crossentropy": 2.2447410821914673, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2052413374185562, + "step": 6648 + }, + { + "epoch": 0.415625, + "grad_norm": 2.15625, + "grad_norm_var": 0.053254954020182294, + "learning_rate": 0.0001, + "loss": 7.1224, + "loss/crossentropy": 1.9521759748458862, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21115896850824356, + "step": 6650 + }, + { + "epoch": 0.41575, + "grad_norm": 2.09375, + "grad_norm_var": 0.05448786417643229, + "learning_rate": 0.0001, + "loss": 7.347, + "loss/crossentropy": 2.5570948123931885, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.24073059856891632, + "step": 6652 + }, + { + "epoch": 0.415875, + "grad_norm": 2.125, + "grad_norm_var": 0.052978261311848955, + "learning_rate": 0.0001, + "loss": 7.1036, + "loss/crossentropy": 1.9738441109657288, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.17930065095424652, + "step": 6654 + }, + { + "epoch": 0.416, + "grad_norm": 2.21875, + "grad_norm_var": 0.05120035807291667, + "learning_rate": 0.0001, + "loss": 7.1908, + "loss/crossentropy": 2.2183395624160767, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19753951579332352, + "step": 6656 + }, + { + "epoch": 0.416125, + "grad_norm": 2.171875, + "grad_norm_var": 0.04938151041666667, + "learning_rate": 0.0001, + "loss": 7.0473, + "loss/crossentropy": 2.2473442554473877, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2003185674548149, + "step": 6658 + }, + { + "epoch": 0.41625, + "grad_norm": 2.015625, + "grad_norm_var": 0.05120035807291667, + "learning_rate": 0.0001, + "loss": 7.1982, + "loss/crossentropy": 2.0214603543281555, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2159905880689621, + "step": 6660 + }, + { + "epoch": 0.416375, + "grad_norm": 2.125, + "grad_norm_var": 0.011494700113932292, + "learning_rate": 0.0001, + "loss": 7.0941, + "loss/crossentropy": 2.1045217514038086, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21138650923967361, + "step": 6662 + }, + { + "epoch": 0.4165, + "grad_norm": 2.234375, + "grad_norm_var": 0.012595367431640626, + "learning_rate": 0.0001, + "loss": 7.1393, + "loss/crossentropy": 2.2603907585144043, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21229936927556992, + "step": 6664 + }, + { + "epoch": 0.416625, + "grad_norm": 2.21875, + "grad_norm_var": 0.011553700764973958, + "learning_rate": 0.0001, + "loss": 6.9486, + "loss/crossentropy": 2.166019320487976, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20004819333553314, + "step": 6666 + }, + { + "epoch": 0.41675, + "grad_norm": 2.109375, + "grad_norm_var": 0.011386871337890625, + "learning_rate": 0.0001, + "loss": 7.2752, + "loss/crossentropy": 2.4337663650512695, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21502867341041565, + "step": 6668 + }, + { + "epoch": 0.416875, + "grad_norm": 2.203125, + "grad_norm_var": 0.011307525634765624, + "learning_rate": 0.0001, + "loss": 7.197, + "loss/crossentropy": 2.383167862892151, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22213765233755112, + "step": 6670 + }, + { + "epoch": 0.417, + "grad_norm": 3.171875, + "grad_norm_var": 0.07418797810872396, + "learning_rate": 0.0001, + "loss": 7.2443, + "loss/crossentropy": 2.469019055366516, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22149549424648285, + "step": 6672 + }, + { + "epoch": 0.417125, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07734781901041667, + "learning_rate": 0.0001, + "loss": 7.1202, + "loss/crossentropy": 2.223373532295227, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20211607217788696, + "step": 6674 + }, + { + "epoch": 0.41725, + "grad_norm": 2.3125, + "grad_norm_var": 0.07432047526041667, + "learning_rate": 0.0001, + "loss": 7.1912, + "loss/crossentropy": 2.2044894695281982, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21385891735553741, + "step": 6676 + }, + { + "epoch": 0.417375, + "grad_norm": 2.0625, + "grad_norm_var": 0.07100601196289062, + "learning_rate": 0.0001, + "loss": 7.2926, + "loss/crossentropy": 2.291485071182251, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.22199992835521698, + "step": 6678 + }, + { + "epoch": 0.4175, + "grad_norm": 2.34375, + "grad_norm_var": 0.06825942993164062, + "learning_rate": 0.0001, + "loss": 7.2736, + "loss/crossentropy": 2.199909269809723, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21268168836832047, + "step": 6680 + }, + { + "epoch": 0.417625, + "grad_norm": 2.296875, + "grad_norm_var": 0.06856257120768229, + "learning_rate": 0.0001, + "loss": 7.255, + "loss/crossentropy": 2.4666231870651245, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22746828943490982, + "step": 6682 + }, + { + "epoch": 0.41775, + "grad_norm": 2.359375, + "grad_norm_var": 0.06691665649414062, + "learning_rate": 0.0001, + "loss": 7.3389, + "loss/crossentropy": 2.2254520654678345, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.19916392862796783, + "step": 6684 + }, + { + "epoch": 0.417875, + "grad_norm": 2.15625, + "grad_norm_var": 0.06772638956705729, + "learning_rate": 0.0001, + "loss": 7.3413, + "loss/crossentropy": 2.230736017227173, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2047268971800804, + "step": 6686 + }, + { + "epoch": 0.418, + "grad_norm": 2.109375, + "grad_norm_var": 0.011329905192057291, + "learning_rate": 0.0001, + "loss": 7.376, + "loss/crossentropy": 2.319286346435547, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21851042658090591, + "step": 6688 + }, + { + "epoch": 0.418125, + "grad_norm": 2.375, + "grad_norm_var": 0.011002604166666667, + "learning_rate": 0.0001, + "loss": 7.2427, + "loss/crossentropy": 2.312131881713867, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22473173588514328, + "step": 6690 + }, + { + "epoch": 0.41825, + "grad_norm": 2.125, + "grad_norm_var": 0.014322916666666666, + "learning_rate": 0.0001, + "loss": 7.2875, + "loss/crossentropy": 2.3510780334472656, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.21289631724357605, + "step": 6692 + }, + { + "epoch": 0.418375, + "grad_norm": 2.125, + "grad_norm_var": 0.014436848958333333, + "learning_rate": 0.0001, + "loss": 7.2495, + "loss/crossentropy": 2.302396059036255, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22023826837539673, + "step": 6694 + }, + { + "epoch": 0.4185, + "grad_norm": 2.171875, + "grad_norm_var": 0.0140533447265625, + "learning_rate": 0.0001, + "loss": 6.9816, + "loss/crossentropy": 2.4627773761749268, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2272256538271904, + "step": 6696 + }, + { + "epoch": 0.418625, + "grad_norm": 2.0625, + "grad_norm_var": 0.01793212890625, + "learning_rate": 0.0001, + "loss": 7.2295, + "loss/crossentropy": 2.464987277984619, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21357368677854538, + "step": 6698 + }, + { + "epoch": 0.41875, + "grad_norm": 2.453125, + "grad_norm_var": 0.0194000244140625, + "learning_rate": 0.0001, + "loss": 7.2894, + "loss/crossentropy": 2.3194926977157593, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21601901948451996, + "step": 6700 + }, + { + "epoch": 0.418875, + "grad_norm": 2.203125, + "grad_norm_var": 0.0193267822265625, + "learning_rate": 0.0001, + "loss": 7.1178, + "loss/crossentropy": 2.158350110054016, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21564631164073944, + "step": 6702 + }, + { + "epoch": 0.419, + "grad_norm": 2.1875, + "grad_norm_var": 0.018895467122395832, + "learning_rate": 0.0001, + "loss": 7.2231, + "loss/crossentropy": 2.300287127494812, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.1997418850660324, + "step": 6704 + }, + { + "epoch": 0.419125, + "grad_norm": 2.125, + "grad_norm_var": 0.013744099934895834, + "learning_rate": 0.0001, + "loss": 7.2349, + "loss/crossentropy": 2.378634810447693, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21635089814662933, + "step": 6706 + }, + { + "epoch": 0.41925, + "grad_norm": 2.15625, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 7.2868, + "loss/crossentropy": 2.3297730684280396, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2152877151966095, + "step": 6708 + }, + { + "epoch": 0.419375, + "grad_norm": 2.34375, + "grad_norm_var": 0.010838826497395834, + "learning_rate": 0.0001, + "loss": 7.3518, + "loss/crossentropy": 2.1663570404052734, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20017527043819427, + "step": 6710 + }, + { + "epoch": 0.4195, + "grad_norm": 2.015625, + "grad_norm_var": 0.013109334309895833, + "learning_rate": 0.0001, + "loss": 7.2772, + "loss/crossentropy": 2.153021812438965, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21046847850084305, + "step": 6712 + }, + { + "epoch": 0.419625, + "grad_norm": 2.140625, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 7.2319, + "loss/crossentropy": 2.362241506576538, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21138739585876465, + "step": 6714 + }, + { + "epoch": 0.41975, + "grad_norm": 2.125, + "grad_norm_var": 0.006669108072916667, + "learning_rate": 0.0001, + "loss": 7.2814, + "loss/crossentropy": 2.3483238220214844, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21662617474794388, + "step": 6716 + }, + { + "epoch": 0.419875, + "grad_norm": 2.390625, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 7.0603, + "loss/crossentropy": 2.3899463415145874, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20360279828310013, + "step": 6718 + }, + { + "epoch": 0.42, + "grad_norm": 2.125, + "grad_norm_var": 0.010868326822916666, + "learning_rate": 0.0001, + "loss": 7.1463, + "loss/crossentropy": 1.9754029512405396, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.17641819268465042, + "step": 6720 + }, + { + "epoch": 0.420125, + "grad_norm": 2.0625, + "grad_norm_var": 0.011359659830729167, + "learning_rate": 0.0001, + "loss": 7.3308, + "loss/crossentropy": 2.39111590385437, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21659883856773376, + "step": 6722 + }, + { + "epoch": 0.42025, + "grad_norm": 2.109375, + "grad_norm_var": 0.014902496337890625, + "learning_rate": 0.0001, + "loss": 7.0377, + "loss/crossentropy": 2.3440463542938232, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20923910290002823, + "step": 6724 + }, + { + "epoch": 0.420375, + "grad_norm": 2.140625, + "grad_norm_var": 0.012416330973307292, + "learning_rate": 0.0001, + "loss": 7.2015, + "loss/crossentropy": 2.3062779903411865, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2264929711818695, + "step": 6726 + }, + { + "epoch": 0.4205, + "grad_norm": 2.03125, + "grad_norm_var": 0.011045074462890625, + "learning_rate": 0.0001, + "loss": 7.2369, + "loss/crossentropy": 2.1152766346931458, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.20576580613851547, + "step": 6728 + }, + { + "epoch": 0.420625, + "grad_norm": 2.1875, + "grad_norm_var": 0.011270904541015625, + "learning_rate": 0.0001, + "loss": 7.2384, + "loss/crossentropy": 2.3229598999023438, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.2038460299372673, + "step": 6730 + }, + { + "epoch": 0.42075, + "grad_norm": 2.078125, + "grad_norm_var": 0.010459136962890626, + "learning_rate": 0.0001, + "loss": 7.1322, + "loss/crossentropy": 2.3162847757339478, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21878770738840103, + "step": 6732 + }, + { + "epoch": 0.420875, + "grad_norm": 2.40625, + "grad_norm_var": 0.013791656494140625, + "learning_rate": 0.0001, + "loss": 7.3428, + "loss/crossentropy": 2.2077457904815674, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19963285326957703, + "step": 6734 + }, + { + "epoch": 0.421, + "grad_norm": 1.984375, + "grad_norm_var": 0.014994049072265625, + "learning_rate": 0.0001, + "loss": 7.1156, + "loss/crossentropy": 2.162528872489929, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.201595239341259, + "step": 6736 + }, + { + "epoch": 0.421125, + "grad_norm": 2.0625, + "grad_norm_var": 0.015634918212890626, + "learning_rate": 0.0001, + "loss": 7.1029, + "loss/crossentropy": 1.9828236103057861, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19044172018766403, + "step": 6738 + }, + { + "epoch": 0.42125, + "grad_norm": 2.21875, + "grad_norm_var": 0.0149078369140625, + "learning_rate": 0.0001, + "loss": 7.2027, + "loss/crossentropy": 2.165103793144226, + "loss/hidden": 2.953125, + "loss/jsd": 0.0, + "loss/logits": 0.21535291522741318, + "step": 6740 + }, + { + "epoch": 0.421375, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01815973917643229, + "learning_rate": 0.0001, + "loss": 7.219, + "loss/crossentropy": 2.0478169918060303, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20827841013669968, + "step": 6742 + }, + { + "epoch": 0.4215, + "grad_norm": 2.125, + "grad_norm_var": 0.01746393839518229, + "learning_rate": 0.0001, + "loss": 7.1333, + "loss/crossentropy": 2.147496223449707, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21103401482105255, + "step": 6744 + }, + { + "epoch": 0.421625, + "grad_norm": 2.140625, + "grad_norm_var": 0.01730524698893229, + "learning_rate": 0.0001, + "loss": 7.0481, + "loss/crossentropy": 2.247206687927246, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2090415209531784, + "step": 6746 + }, + { + "epoch": 0.42175, + "grad_norm": 2.0, + "grad_norm_var": 0.0195953369140625, + "learning_rate": 0.0001, + "loss": 6.993, + "loss/crossentropy": 2.2320778369903564, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20059379935264587, + "step": 6748 + }, + { + "epoch": 0.421875, + "grad_norm": 2.234375, + "grad_norm_var": 0.01278076171875, + "learning_rate": 0.0001, + "loss": 7.2886, + "loss/crossentropy": 2.1443710327148438, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19475404173135757, + "step": 6750 + }, + { + "epoch": 0.422, + "grad_norm": 2.296875, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 7.1489, + "loss/crossentropy": 2.279394507408142, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20637677609920502, + "step": 6752 + }, + { + "epoch": 0.422125, + "grad_norm": 2.078125, + "grad_norm_var": 0.014835611979166666, + "learning_rate": 0.0001, + "loss": 7.0726, + "loss/crossentropy": 1.897760272026062, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2159346044063568, + "step": 6754 + }, + { + "epoch": 0.42225, + "grad_norm": 2.109375, + "grad_norm_var": 0.012621053059895833, + "learning_rate": 0.0001, + "loss": 7.1816, + "loss/crossentropy": 2.283962607383728, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.23135912418365479, + "step": 6756 + }, + { + "epoch": 0.422375, + "grad_norm": 2.140625, + "grad_norm_var": 0.009423573811848959, + "learning_rate": 0.0001, + "loss": 7.1644, + "loss/crossentropy": 2.388902187347412, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21581237763166428, + "step": 6758 + }, + { + "epoch": 0.4225, + "grad_norm": 2.0625, + "grad_norm_var": 0.010473378499348958, + "learning_rate": 0.0001, + "loss": 7.1085, + "loss/crossentropy": 2.092213809490204, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20688287913799286, + "step": 6760 + }, + { + "epoch": 0.422625, + "grad_norm": 2.28125, + "grad_norm_var": 0.011773427327473959, + "learning_rate": 0.0001, + "loss": 7.0663, + "loss/crossentropy": 2.2318451404571533, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19159415364265442, + "step": 6762 + }, + { + "epoch": 0.42275, + "grad_norm": 2.265625, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 7.0906, + "loss/crossentropy": 2.011130690574646, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20954276621341705, + "step": 6764 + }, + { + "epoch": 0.422875, + "grad_norm": 2.078125, + "grad_norm_var": 0.0102935791015625, + "learning_rate": 0.0001, + "loss": 7.2399, + "loss/crossentropy": 2.238163113594055, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19764646142721176, + "step": 6766 + }, + { + "epoch": 0.423, + "grad_norm": 1.921875, + "grad_norm_var": 0.010285441080729167, + "learning_rate": 0.0001, + "loss": 7.0323, + "loss/crossentropy": 2.180557131767273, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19082503020763397, + "step": 6768 + }, + { + "epoch": 0.423125, + "grad_norm": 2.328125, + "grad_norm_var": 0.012702433268229167, + "learning_rate": 0.0001, + "loss": 7.197, + "loss/crossentropy": 2.274542212486267, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21664000302553177, + "step": 6770 + }, + { + "epoch": 0.42325, + "grad_norm": 2.046875, + "grad_norm_var": 0.027489217122395833, + "learning_rate": 0.0001, + "loss": 7.0367, + "loss/crossentropy": 2.271838068962097, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21325764805078506, + "step": 6772 + }, + { + "epoch": 0.423375, + "grad_norm": 2.171875, + "grad_norm_var": 0.030661773681640626, + "learning_rate": 0.0001, + "loss": 7.257, + "loss/crossentropy": 2.0293312668800354, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19651466608047485, + "step": 6774 + }, + { + "epoch": 0.4235, + "grad_norm": 2.265625, + "grad_norm_var": 0.030142974853515626, + "learning_rate": 0.0001, + "loss": 7.1123, + "loss/crossentropy": 2.3418461084365845, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2213895097374916, + "step": 6776 + }, + { + "epoch": 0.423625, + "grad_norm": 2.234375, + "grad_norm_var": 0.030594635009765624, + "learning_rate": 0.0001, + "loss": 7.2606, + "loss/crossentropy": 2.3157602548599243, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.23133724927902222, + "step": 6778 + }, + { + "epoch": 0.42375, + "grad_norm": 2.015625, + "grad_norm_var": 0.031288401285807295, + "learning_rate": 0.0001, + "loss": 7.0713, + "loss/crossentropy": 2.2262462377548218, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21064256131649017, + "step": 6780 + }, + { + "epoch": 0.423875, + "grad_norm": 2.234375, + "grad_norm_var": 0.03070856730143229, + "learning_rate": 0.0001, + "loss": 7.2447, + "loss/crossentropy": 2.6724666357040405, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2289513200521469, + "step": 6782 + }, + { + "epoch": 0.424, + "grad_norm": 3.84375, + "grad_norm_var": 0.1964617411295573, + "learning_rate": 0.0001, + "loss": 7.2904, + "loss/crossentropy": 2.3239933252334595, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21807952970266342, + "step": 6784 + }, + { + "epoch": 0.424125, + "grad_norm": 2.484375, + "grad_norm_var": 0.20003433227539064, + "learning_rate": 0.0001, + "loss": 7.2351, + "loss/crossentropy": 2.0945045948028564, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.21246600896120071, + "step": 6786 + }, + { + "epoch": 0.42425, + "grad_norm": 2.140625, + "grad_norm_var": 0.19189224243164063, + "learning_rate": 0.0001, + "loss": 7.0303, + "loss/crossentropy": 2.0538535118103027, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.19015134125947952, + "step": 6788 + }, + { + "epoch": 0.424375, + "grad_norm": 2.03125, + "grad_norm_var": 0.18770243326822916, + "learning_rate": 0.0001, + "loss": 7.107, + "loss/crossentropy": 2.1239798069000244, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2081196829676628, + "step": 6790 + }, + { + "epoch": 0.4245, + "grad_norm": 2.078125, + "grad_norm_var": 0.19275614420572917, + "learning_rate": 0.0001, + "loss": 7.1894, + "loss/crossentropy": 2.553703784942627, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.20920544862747192, + "step": 6792 + }, + { + "epoch": 0.424625, + "grad_norm": 2.5, + "grad_norm_var": 0.19491780598958333, + "learning_rate": 0.0001, + "loss": 7.2841, + "loss/crossentropy": 2.5824981927871704, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21979796141386032, + "step": 6794 + }, + { + "epoch": 0.42475, + "grad_norm": 2.046875, + "grad_norm_var": 0.19247945149739584, + "learning_rate": 0.0001, + "loss": 7.0355, + "loss/crossentropy": 2.185934007167816, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19819935411214828, + "step": 6796 + }, + { + "epoch": 0.424875, + "grad_norm": 2.21875, + "grad_norm_var": 0.18901265462239583, + "learning_rate": 0.0001, + "loss": 7.3111, + "loss/crossentropy": 2.1742522716522217, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.21659693121910095, + "step": 6798 + }, + { + "epoch": 0.425, + "grad_norm": 2.09375, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 0.0001, + "loss": 7.2202, + "loss/crossentropy": 2.3170909881591797, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21167440712451935, + "step": 6800 + }, + { + "epoch": 0.425125, + "grad_norm": 2.15625, + "grad_norm_var": 0.015461222330729166, + "learning_rate": 0.0001, + "loss": 7.229, + "loss/crossentropy": 2.2571088075637817, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.21498262882232666, + "step": 6802 + }, + { + "epoch": 0.42525, + "grad_norm": 2.3125, + "grad_norm_var": 0.016535441080729168, + "learning_rate": 0.0001, + "loss": 7.127, + "loss/crossentropy": 2.3108266592025757, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21807894110679626, + "step": 6804 + }, + { + "epoch": 0.425375, + "grad_norm": 2.046875, + "grad_norm_var": 0.0168853759765625, + "learning_rate": 0.0001, + "loss": 7.2834, + "loss/crossentropy": 2.15952205657959, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19036553800106049, + "step": 6806 + }, + { + "epoch": 0.4255, + "grad_norm": 2.140625, + "grad_norm_var": 0.016356404622395834, + "learning_rate": 0.0001, + "loss": 7.2448, + "loss/crossentropy": 2.15840220451355, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20849886536598206, + "step": 6808 + }, + { + "epoch": 0.425625, + "grad_norm": 2.078125, + "grad_norm_var": 0.008039347330729167, + "learning_rate": 0.0001, + "loss": 7.2537, + "loss/crossentropy": 2.4403945207595825, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2224520891904831, + "step": 6810 + }, + { + "epoch": 0.42575, + "grad_norm": 2.15625, + "grad_norm_var": 0.0072743733723958336, + "learning_rate": 0.0001, + "loss": 7.1626, + "loss/crossentropy": 2.308092713356018, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20341620594263077, + "step": 6812 + }, + { + "epoch": 0.425875, + "grad_norm": 2.3125, + "grad_norm_var": 0.009015909830729167, + "learning_rate": 0.0001, + "loss": 7.0819, + "loss/crossentropy": 2.172079086303711, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21274089068174362, + "step": 6814 + }, + { + "epoch": 0.426, + "grad_norm": 2.140625, + "grad_norm_var": 0.012919108072916666, + "learning_rate": 0.0001, + "loss": 7.3531, + "loss/crossentropy": 2.1660048365592957, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21018584817647934, + "step": 6816 + }, + { + "epoch": 0.426125, + "grad_norm": 2.1875, + "grad_norm_var": 0.013460286458333333, + "learning_rate": 0.0001, + "loss": 7.1127, + "loss/crossentropy": 2.2449586391448975, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21250271797180176, + "step": 6818 + }, + { + "epoch": 0.42625, + "grad_norm": 2.109375, + "grad_norm_var": 0.012626139322916667, + "learning_rate": 0.0001, + "loss": 7.1219, + "loss/crossentropy": 2.40755295753479, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2143237292766571, + "step": 6820 + }, + { + "epoch": 0.426375, + "grad_norm": 2.125, + "grad_norm_var": 0.0127349853515625, + "learning_rate": 0.0001, + "loss": 7.1712, + "loss/crossentropy": 2.077523171901703, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21024423092603683, + "step": 6822 + }, + { + "epoch": 0.4265, + "grad_norm": 2.140625, + "grad_norm_var": 0.01285400390625, + "learning_rate": 0.0001, + "loss": 7.1869, + "loss/crossentropy": 2.2102068662643433, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22490687668323517, + "step": 6824 + }, + { + "epoch": 0.426625, + "grad_norm": 2.21875, + "grad_norm_var": 0.0135162353515625, + "learning_rate": 0.0001, + "loss": 7.144, + "loss/crossentropy": 2.5356907844543457, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2128807231783867, + "step": 6826 + }, + { + "epoch": 0.42675, + "grad_norm": 1.984375, + "grad_norm_var": 0.016258748372395833, + "learning_rate": 0.0001, + "loss": 7.1871, + "loss/crossentropy": 2.4186887741088867, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2111327275633812, + "step": 6828 + }, + { + "epoch": 0.426875, + "grad_norm": 2.296875, + "grad_norm_var": 0.0157623291015625, + "learning_rate": 0.0001, + "loss": 7.2081, + "loss/crossentropy": 2.4064241647720337, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22412901371717453, + "step": 6830 + }, + { + "epoch": 0.427, + "grad_norm": 2.046875, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 7.1116, + "loss/crossentropy": 2.1347469091415405, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.1934138983488083, + "step": 6832 + }, + { + "epoch": 0.427125, + "grad_norm": 2.40625, + "grad_norm_var": 0.012376912434895833, + "learning_rate": 0.0001, + "loss": 7.0814, + "loss/crossentropy": 2.5839143991470337, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22045065462589264, + "step": 6834 + }, + { + "epoch": 0.42725, + "grad_norm": 1.96875, + "grad_norm_var": 0.014615885416666667, + "learning_rate": 0.0001, + "loss": 7.2174, + "loss/crossentropy": 2.4431090354919434, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2187180370092392, + "step": 6836 + }, + { + "epoch": 0.427375, + "grad_norm": 2.234375, + "grad_norm_var": 0.014191691080729167, + "learning_rate": 0.0001, + "loss": 7.2399, + "loss/crossentropy": 2.3245296478271484, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21662010997533798, + "step": 6838 + }, + { + "epoch": 0.4275, + "grad_norm": 1.96875, + "grad_norm_var": 0.016559855143229166, + "learning_rate": 0.0001, + "loss": 7.0713, + "loss/crossentropy": 2.178970217704773, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.21533022820949554, + "step": 6840 + }, + { + "epoch": 0.427625, + "grad_norm": 2.109375, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 7.0306, + "loss/crossentropy": 2.044509172439575, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19547070562839508, + "step": 6842 + }, + { + "epoch": 0.42775, + "grad_norm": 2.046875, + "grad_norm_var": 0.0145904541015625, + "learning_rate": 0.0001, + "loss": 7.1904, + "loss/crossentropy": 2.6852035522460938, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23019836843013763, + "step": 6844 + }, + { + "epoch": 0.427875, + "grad_norm": 2.140625, + "grad_norm_var": 0.011649576822916667, + "learning_rate": 0.0001, + "loss": 7.1795, + "loss/crossentropy": 2.404147505760193, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21275443583726883, + "step": 6846 + }, + { + "epoch": 0.428, + "grad_norm": 2.296875, + "grad_norm_var": 0.013118489583333334, + "learning_rate": 0.0001, + "loss": 7.1491, + "loss/crossentropy": 2.30082368850708, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20821545273065567, + "step": 6848 + }, + { + "epoch": 0.428125, + "grad_norm": 2.140625, + "grad_norm_var": 0.0087066650390625, + "learning_rate": 0.0001, + "loss": 7.243, + "loss/crossentropy": 2.41938316822052, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2598809227347374, + "step": 6850 + }, + { + "epoch": 0.42825, + "grad_norm": 2.1875, + "grad_norm_var": 0.007552083333333333, + "learning_rate": 0.0001, + "loss": 7.1086, + "loss/crossentropy": 2.22542405128479, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2023932784795761, + "step": 6852 + }, + { + "epoch": 0.428375, + "grad_norm": 2.1875, + "grad_norm_var": 0.010448201497395834, + "learning_rate": 0.0001, + "loss": 7.2592, + "loss/crossentropy": 2.057928144931793, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.1930096223950386, + "step": 6854 + }, + { + "epoch": 0.4285, + "grad_norm": 2.0, + "grad_norm_var": 0.010676066080729166, + "learning_rate": 0.0001, + "loss": 7.1224, + "loss/crossentropy": 2.1819591522216797, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1996973529458046, + "step": 6856 + }, + { + "epoch": 0.428625, + "grad_norm": 2.109375, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 7.1747, + "loss/crossentropy": 2.0980719327926636, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.22104358673095703, + "step": 6858 + }, + { + "epoch": 0.42875, + "grad_norm": 2.15625, + "grad_norm_var": 0.016243489583333333, + "learning_rate": 0.0001, + "loss": 6.913, + "loss/crossentropy": 2.1513818502426147, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21680350601673126, + "step": 6860 + }, + { + "epoch": 0.428875, + "grad_norm": 2.59375, + "grad_norm_var": 0.0395904541015625, + "learning_rate": 0.0001, + "loss": 7.2397, + "loss/crossentropy": 2.2294562458992004, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22388208657503128, + "step": 6862 + }, + { + "epoch": 0.429, + "grad_norm": 2.171875, + "grad_norm_var": 0.03950093587239583, + "learning_rate": 0.0001, + "loss": 7.0361, + "loss/crossentropy": 2.283179521560669, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21150479465723038, + "step": 6864 + }, + { + "epoch": 0.429125, + "grad_norm": 2.125, + "grad_norm_var": 0.03980712890625, + "learning_rate": 0.0001, + "loss": 7.1224, + "loss/crossentropy": 2.3938897848129272, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2365514636039734, + "step": 6866 + }, + { + "epoch": 0.42925, + "grad_norm": 2.328125, + "grad_norm_var": 0.0417877197265625, + "learning_rate": 0.0001, + "loss": 7.1785, + "loss/crossentropy": 2.1658443212509155, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19529584795236588, + "step": 6868 + }, + { + "epoch": 0.429375, + "grad_norm": 2.046875, + "grad_norm_var": 0.04097900390625, + "learning_rate": 0.0001, + "loss": 7.1715, + "loss/crossentropy": 2.0922370553016663, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19775715470314026, + "step": 6870 + }, + { + "epoch": 0.4295, + "grad_norm": 2.03125, + "grad_norm_var": 0.03996480305989583, + "learning_rate": 0.0001, + "loss": 7.0673, + "loss/crossentropy": 1.8662742376327515, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2048928141593933, + "step": 6872 + }, + { + "epoch": 0.429625, + "grad_norm": 2.15625, + "grad_norm_var": 0.039990234375, + "learning_rate": 0.0001, + "loss": 7.2334, + "loss/crossentropy": 2.25331974029541, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2132648006081581, + "step": 6874 + }, + { + "epoch": 0.42975, + "grad_norm": 2.171875, + "grad_norm_var": 0.03132222493489583, + "learning_rate": 0.0001, + "loss": 7.0384, + "loss/crossentropy": 2.0938963890075684, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19403553754091263, + "step": 6876 + }, + { + "epoch": 0.429875, + "grad_norm": 2.171875, + "grad_norm_var": 0.0096588134765625, + "learning_rate": 0.0001, + "loss": 7.1117, + "loss/crossentropy": 2.201113998889923, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2137242630124092, + "step": 6878 + }, + { + "epoch": 0.43, + "grad_norm": 2.0625, + "grad_norm_var": 0.009700520833333334, + "learning_rate": 0.0001, + "loss": 7.1669, + "loss/crossentropy": 2.0272424817085266, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.19501972198486328, + "step": 6880 + }, + { + "epoch": 0.430125, + "grad_norm": 2.140625, + "grad_norm_var": 0.009423828125, + "learning_rate": 0.0001, + "loss": 7.2121, + "loss/crossentropy": 2.506343126296997, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.22914932668209076, + "step": 6882 + }, + { + "epoch": 0.43025, + "grad_norm": 2.109375, + "grad_norm_var": 0.004011027018229167, + "learning_rate": 0.0001, + "loss": 7.2302, + "loss/crossentropy": 2.3738802671432495, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21538487076759338, + "step": 6884 + }, + { + "epoch": 0.430375, + "grad_norm": 2.125, + "grad_norm_var": 0.0032063802083333332, + "learning_rate": 0.0001, + "loss": 7.0062, + "loss/crossentropy": 2.2716506719589233, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20168288797140121, + "step": 6886 + }, + { + "epoch": 0.4305, + "grad_norm": 2.046875, + "grad_norm_var": 0.0031412760416666668, + "learning_rate": 0.0001, + "loss": 7.0321, + "loss/crossentropy": 2.131265103816986, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19480124861001968, + "step": 6888 + }, + { + "epoch": 0.430625, + "grad_norm": 2.140625, + "grad_norm_var": 0.005765533447265625, + "learning_rate": 0.0001, + "loss": 7.0555, + "loss/crossentropy": 2.56924045085907, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21264956146478653, + "step": 6890 + }, + { + "epoch": 0.43075, + "grad_norm": 2.15625, + "grad_norm_var": 0.005863189697265625, + "learning_rate": 0.0001, + "loss": 7.29, + "loss/crossentropy": 2.1670228242874146, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2205064371228218, + "step": 6892 + }, + { + "epoch": 0.430875, + "grad_norm": 2.078125, + "grad_norm_var": 0.010558827718098959, + "learning_rate": 0.0001, + "loss": 7.0508, + "loss/crossentropy": 2.2021427154541016, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21743560582399368, + "step": 6894 + }, + { + "epoch": 0.431, + "grad_norm": 2.28125, + "grad_norm_var": 0.013138580322265624, + "learning_rate": 0.0001, + "loss": 7.1581, + "loss/crossentropy": 2.3325300216674805, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22179429978132248, + "step": 6896 + }, + { + "epoch": 0.431125, + "grad_norm": 2.140625, + "grad_norm_var": 0.013879140218098959, + "learning_rate": 0.0001, + "loss": 7.134, + "loss/crossentropy": 2.150698184967041, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21261726319789886, + "step": 6898 + }, + { + "epoch": 0.43125, + "grad_norm": 2.359375, + "grad_norm_var": 0.016928863525390626, + "learning_rate": 0.0001, + "loss": 7.0722, + "loss/crossentropy": 2.262513518333435, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2029724195599556, + "step": 6900 + }, + { + "epoch": 0.431375, + "grad_norm": 2.015625, + "grad_norm_var": 0.01788304646809896, + "learning_rate": 0.0001, + "loss": 7.0642, + "loss/crossentropy": 2.2124452590942383, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20887747406959534, + "step": 6902 + }, + { + "epoch": 0.4315, + "grad_norm": 2.3125, + "grad_norm_var": 0.018552398681640624, + "learning_rate": 0.0001, + "loss": 7.3876, + "loss/crossentropy": 2.4545419216156006, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.263492152094841, + "step": 6904 + }, + { + "epoch": 0.431625, + "grad_norm": 2.265625, + "grad_norm_var": 0.016487630208333333, + "learning_rate": 0.0001, + "loss": 7.1829, + "loss/crossentropy": 2.244446575641632, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20837052166461945, + "step": 6906 + }, + { + "epoch": 0.43175, + "grad_norm": 2.09375, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 7.1886, + "loss/crossentropy": 2.059956908226013, + "loss/hidden": 2.71875, + "loss/jsd": 0.0, + "loss/logits": 0.18521716445684433, + "step": 6908 + }, + { + "epoch": 0.431875, + "grad_norm": 2.296875, + "grad_norm_var": 0.010380045572916666, + "learning_rate": 0.0001, + "loss": 7.0432, + "loss/crossentropy": 2.323797821998596, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20761998742818832, + "step": 6910 + }, + { + "epoch": 0.432, + "grad_norm": 2.09375, + "grad_norm_var": 0.010236612955729167, + "learning_rate": 0.0001, + "loss": 7.216, + "loss/crossentropy": 2.1744368076324463, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.1985497921705246, + "step": 6912 + }, + { + "epoch": 0.432125, + "grad_norm": 2.15625, + "grad_norm_var": 0.009935506184895833, + "learning_rate": 0.0001, + "loss": 7.0865, + "loss/crossentropy": 2.1344690322875977, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22802872955799103, + "step": 6914 + }, + { + "epoch": 0.43225, + "grad_norm": 2.09375, + "grad_norm_var": 0.007340494791666667, + "learning_rate": 0.0001, + "loss": 7.1136, + "loss/crossentropy": 2.186421036720276, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19479546695947647, + "step": 6916 + }, + { + "epoch": 0.432375, + "grad_norm": 2.203125, + "grad_norm_var": 0.007942708333333333, + "learning_rate": 0.0001, + "loss": 7.1482, + "loss/crossentropy": 1.894170880317688, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.18849200010299683, + "step": 6918 + }, + { + "epoch": 0.4325, + "grad_norm": 2.375, + "grad_norm_var": 0.0094146728515625, + "learning_rate": 0.0001, + "loss": 7.3064, + "loss/crossentropy": 2.443792700767517, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22042584419250488, + "step": 6920 + }, + { + "epoch": 0.432625, + "grad_norm": 2.046875, + "grad_norm_var": 0.012040201822916667, + "learning_rate": 0.0001, + "loss": 7.1741, + "loss/crossentropy": 2.199779987335205, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21045687794685364, + "step": 6922 + }, + { + "epoch": 0.43275, + "grad_norm": 2.09375, + "grad_norm_var": 0.012369791666666666, + "learning_rate": 0.0001, + "loss": 7.0816, + "loss/crossentropy": 1.9734878540039062, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.18932150304317474, + "step": 6924 + }, + { + "epoch": 0.432875, + "grad_norm": 2.015625, + "grad_norm_var": 0.01383056640625, + "learning_rate": 0.0001, + "loss": 7.0272, + "loss/crossentropy": 2.060529947280884, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20331456512212753, + "step": 6926 + }, + { + "epoch": 0.433, + "grad_norm": 2.109375, + "grad_norm_var": 0.013232421875, + "learning_rate": 0.0001, + "loss": 7.1574, + "loss/crossentropy": 2.1172688007354736, + "loss/hidden": 2.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2042495608329773, + "step": 6928 + }, + { + "epoch": 0.433125, + "grad_norm": 2.09375, + "grad_norm_var": 0.013524373372395834, + "learning_rate": 0.0001, + "loss": 7.2431, + "loss/crossentropy": 2.218605160713196, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2152126282453537, + "step": 6930 + }, + { + "epoch": 0.43325, + "grad_norm": 2.03125, + "grad_norm_var": 0.014020792643229167, + "learning_rate": 0.0001, + "loss": 7.0866, + "loss/crossentropy": 2.047509551048279, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19541727006435394, + "step": 6932 + }, + { + "epoch": 0.433375, + "grad_norm": 2.046875, + "grad_norm_var": 0.014922841389973959, + "learning_rate": 0.0001, + "loss": 7.0279, + "loss/crossentropy": 2.1803908348083496, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.18999575078487396, + "step": 6934 + }, + { + "epoch": 0.4335, + "grad_norm": 2.171875, + "grad_norm_var": 0.009893544514973958, + "learning_rate": 0.0001, + "loss": 7.2116, + "loss/crossentropy": 2.3334230184555054, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2261931300163269, + "step": 6936 + }, + { + "epoch": 0.433625, + "grad_norm": 2.234375, + "grad_norm_var": 0.008129628499348958, + "learning_rate": 0.0001, + "loss": 7.1294, + "loss/crossentropy": 2.2865262031555176, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21169719099998474, + "step": 6938 + }, + { + "epoch": 0.43375, + "grad_norm": 2.046875, + "grad_norm_var": 0.008837636311848958, + "learning_rate": 0.0001, + "loss": 6.9919, + "loss/crossentropy": 2.046865701675415, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19481410086154938, + "step": 6940 + }, + { + "epoch": 0.433875, + "grad_norm": 2.25, + "grad_norm_var": 0.009787750244140626, + "learning_rate": 0.0001, + "loss": 7.2462, + "loss/crossentropy": 2.44638729095459, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22355614602565765, + "step": 6942 + }, + { + "epoch": 0.434, + "grad_norm": 2.0, + "grad_norm_var": 0.010550689697265626, + "learning_rate": 0.0001, + "loss": 7.078, + "loss/crossentropy": 2.1784814596176147, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20400924235582352, + "step": 6944 + }, + { + "epoch": 0.434125, + "grad_norm": 2.171875, + "grad_norm_var": 0.011446126302083333, + "learning_rate": 0.0001, + "loss": 7.0531, + "loss/crossentropy": 2.4585882425308228, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22780652344226837, + "step": 6946 + }, + { + "epoch": 0.43425, + "grad_norm": 2.234375, + "grad_norm_var": 0.01248779296875, + "learning_rate": 0.0001, + "loss": 7.1479, + "loss/crossentropy": 2.1119790077209473, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.199091836810112, + "step": 6948 + }, + { + "epoch": 0.434375, + "grad_norm": 2.25, + "grad_norm_var": 0.009987131754557291, + "learning_rate": 0.0001, + "loss": 7.1265, + "loss/crossentropy": 2.2907246351242065, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2269321158528328, + "step": 6950 + }, + { + "epoch": 0.4345, + "grad_norm": 1.921875, + "grad_norm_var": 0.013183339436848959, + "learning_rate": 0.0001, + "loss": 7.0487, + "loss/crossentropy": 2.2639458179473877, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19958817213773727, + "step": 6952 + }, + { + "epoch": 0.434625, + "grad_norm": 2.21875, + "grad_norm_var": 0.013014475504557291, + "learning_rate": 0.0001, + "loss": 7.0662, + "loss/crossentropy": 2.39899480342865, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21808726340532303, + "step": 6954 + }, + { + "epoch": 0.43475, + "grad_norm": 2.046875, + "grad_norm_var": 0.015541330973307291, + "learning_rate": 0.0001, + "loss": 7.1265, + "loss/crossentropy": 2.3878051042556763, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21249045431613922, + "step": 6956 + }, + { + "epoch": 0.434875, + "grad_norm": 2.203125, + "grad_norm_var": 0.014817047119140624, + "learning_rate": 0.0001, + "loss": 7.0378, + "loss/crossentropy": 2.0612010955810547, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2078692466020584, + "step": 6958 + }, + { + "epoch": 0.435, + "grad_norm": 2.0625, + "grad_norm_var": 0.014743804931640625, + "learning_rate": 0.0001, + "loss": 7.1189, + "loss/crossentropy": 2.1870445013046265, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20125596970319748, + "step": 6960 + }, + { + "epoch": 0.435125, + "grad_norm": 2.546875, + "grad_norm_var": 0.0228424072265625, + "learning_rate": 0.0001, + "loss": 7.1462, + "loss/crossentropy": 2.318724036216736, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20482215285301208, + "step": 6962 + }, + { + "epoch": 0.43525, + "grad_norm": 2.046875, + "grad_norm_var": 0.0268951416015625, + "learning_rate": 0.0001, + "loss": 7.1296, + "loss/crossentropy": 2.5011104345321655, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.22151333093643188, + "step": 6964 + }, + { + "epoch": 0.435375, + "grad_norm": 2.109375, + "grad_norm_var": 0.026887003580729166, + "learning_rate": 0.0001, + "loss": 7.1403, + "loss/crossentropy": 2.384117841720581, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21883900463581085, + "step": 6966 + }, + { + "epoch": 0.4355, + "grad_norm": 2.3125, + "grad_norm_var": 0.023705037434895833, + "learning_rate": 0.0001, + "loss": 7.0673, + "loss/crossentropy": 2.29778790473938, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21907077729701996, + "step": 6968 + }, + { + "epoch": 0.435625, + "grad_norm": 2.0625, + "grad_norm_var": 0.021540323893229168, + "learning_rate": 0.0001, + "loss": 7.1069, + "loss/crossentropy": 2.1342278718948364, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.18501774966716766, + "step": 6970 + }, + { + "epoch": 0.43575, + "grad_norm": 2.265625, + "grad_norm_var": 0.022297159830729166, + "learning_rate": 0.0001, + "loss": 7.2041, + "loss/crossentropy": 2.4118359088897705, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.1985846757888794, + "step": 6972 + }, + { + "epoch": 0.435875, + "grad_norm": 2.125, + "grad_norm_var": 0.022435506184895832, + "learning_rate": 0.0001, + "loss": 7.1099, + "loss/crossentropy": 2.274700164794922, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.1986750066280365, + "step": 6974 + }, + { + "epoch": 0.436, + "grad_norm": 2.078125, + "grad_norm_var": 0.022736612955729166, + "learning_rate": 0.0001, + "loss": 7.0998, + "loss/crossentropy": 2.030650019645691, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20323613286018372, + "step": 6976 + }, + { + "epoch": 0.436125, + "grad_norm": 1.984375, + "grad_norm_var": 0.017096964518229167, + "learning_rate": 0.0001, + "loss": 7.0212, + "loss/crossentropy": 2.1681824922561646, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19574403017759323, + "step": 6978 + }, + { + "epoch": 0.43625, + "grad_norm": 2.21875, + "grad_norm_var": 0.012287394205729166, + "learning_rate": 0.0001, + "loss": 7.3248, + "loss/crossentropy": 2.2973861694335938, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2214999794960022, + "step": 6980 + }, + { + "epoch": 0.436375, + "grad_norm": 2.125, + "grad_norm_var": 0.014435831705729167, + "learning_rate": 0.0001, + "loss": 7.0755, + "loss/crossentropy": 2.2429606914520264, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.21233399212360382, + "step": 6982 + }, + { + "epoch": 0.4365, + "grad_norm": 2.078125, + "grad_norm_var": 0.013199869791666667, + "learning_rate": 0.0001, + "loss": 7.0234, + "loss/crossentropy": 2.0870251655578613, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.18894104659557343, + "step": 6984 + }, + { + "epoch": 0.436625, + "grad_norm": 2.390625, + "grad_norm_var": 0.024022420247395832, + "learning_rate": 0.0001, + "loss": 7.1037, + "loss/crossentropy": 1.8943498730659485, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.19573025405406952, + "step": 6986 + }, + { + "epoch": 0.43675, + "grad_norm": 1.953125, + "grad_norm_var": 0.023616536458333334, + "learning_rate": 0.0001, + "loss": 7.2237, + "loss/crossentropy": 2.5429954528808594, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.23790296912193298, + "step": 6988 + }, + { + "epoch": 0.436875, + "grad_norm": 2.296875, + "grad_norm_var": 0.024690755208333335, + "learning_rate": 0.0001, + "loss": 7.1511, + "loss/crossentropy": 2.0892491340637207, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.19531705975532532, + "step": 6990 + }, + { + "epoch": 0.437, + "grad_norm": 2.015625, + "grad_norm_var": 0.024828084309895835, + "learning_rate": 0.0001, + "loss": 7.2602, + "loss/crossentropy": 2.1488635540008545, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21692749112844467, + "step": 6992 + }, + { + "epoch": 0.437125, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02623265584309896, + "learning_rate": 0.0001, + "loss": 7.1101, + "loss/crossentropy": 2.3074164390563965, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20840033143758774, + "step": 6994 + }, + { + "epoch": 0.43725, + "grad_norm": 2.21875, + "grad_norm_var": 0.026318105061848958, + "learning_rate": 0.0001, + "loss": 7.0916, + "loss/crossentropy": 2.2144097089767456, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21698354184627533, + "step": 6996 + }, + { + "epoch": 0.437375, + "grad_norm": 2.03125, + "grad_norm_var": 0.02490208943684896, + "learning_rate": 0.0001, + "loss": 7.2198, + "loss/crossentropy": 2.2563615441322327, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.1933683454990387, + "step": 6998 + }, + { + "epoch": 0.4375, + "grad_norm": 2.390625, + "grad_norm_var": 0.028254954020182292, + "learning_rate": 0.0001, + "loss": 7.2343, + "loss/crossentropy": 2.388770818710327, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21556702256202698, + "step": 7000 + }, + { + "epoch": 0.437625, + "grad_norm": 1.8515625, + "grad_norm_var": 0.022370402018229166, + "learning_rate": 0.0001, + "loss": 7.0739, + "loss/crossentropy": 2.278029680252075, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20425931364297867, + "step": 7002 + }, + { + "epoch": 0.43775, + "grad_norm": 2.09375, + "grad_norm_var": 0.020137532552083334, + "learning_rate": 0.0001, + "loss": 7.0929, + "loss/crossentropy": 2.2358744144439697, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.196475088596344, + "step": 7004 + }, + { + "epoch": 0.437875, + "grad_norm": 2.28125, + "grad_norm_var": 0.0196685791015625, + "learning_rate": 0.0001, + "loss": 7.0986, + "loss/crossentropy": 2.077066421508789, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.18660877645015717, + "step": 7006 + }, + { + "epoch": 0.438, + "grad_norm": 2.4375, + "grad_norm_var": 0.024275716145833334, + "learning_rate": 0.0001, + "loss": 7.1155, + "loss/crossentropy": 2.2696588039398193, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2037624940276146, + "step": 7008 + }, + { + "epoch": 0.438125, + "grad_norm": 2.015625, + "grad_norm_var": 0.02268040974934896, + "learning_rate": 0.0001, + "loss": 7.2975, + "loss/crossentropy": 2.3751505613327026, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21261807531118393, + "step": 7010 + }, + { + "epoch": 0.43825, + "grad_norm": 2.203125, + "grad_norm_var": 0.022517649332682292, + "learning_rate": 0.0001, + "loss": 7.2177, + "loss/crossentropy": 2.265856981277466, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.1969621181488037, + "step": 7012 + }, + { + "epoch": 0.438375, + "grad_norm": 2.15625, + "grad_norm_var": 0.021996815999348957, + "learning_rate": 0.0001, + "loss": 7.0688, + "loss/crossentropy": 2.275804281234741, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21681126207113266, + "step": 7014 + }, + { + "epoch": 0.4385, + "grad_norm": 2.125, + "grad_norm_var": 0.018047841389973958, + "learning_rate": 0.0001, + "loss": 7.1952, + "loss/crossentropy": 2.355897307395935, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20308750867843628, + "step": 7016 + }, + { + "epoch": 0.438625, + "grad_norm": 2.203125, + "grad_norm_var": 0.011701456705729167, + "learning_rate": 0.0001, + "loss": 7.1911, + "loss/crossentropy": 2.5430479049682617, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21198173612356186, + "step": 7018 + }, + { + "epoch": 0.43875, + "grad_norm": 2.15625, + "grad_norm_var": 0.010895792643229167, + "learning_rate": 0.0001, + "loss": 7.2542, + "loss/crossentropy": 2.3191837072372437, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.21172154694795609, + "step": 7020 + }, + { + "epoch": 0.438875, + "grad_norm": 2.171875, + "grad_norm_var": 0.0104400634765625, + "learning_rate": 0.0001, + "loss": 7.164, + "loss/crossentropy": 2.1724308729171753, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.21450695395469666, + "step": 7022 + }, + { + "epoch": 0.439, + "grad_norm": 2.0625, + "grad_norm_var": 0.00592041015625, + "learning_rate": 0.0001, + "loss": 7.1836, + "loss/crossentropy": 2.2622636556625366, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22127537429332733, + "step": 7024 + }, + { + "epoch": 0.439125, + "grad_norm": 2.0625, + "grad_norm_var": 0.003855133056640625, + "learning_rate": 0.0001, + "loss": 7.0389, + "loss/crossentropy": 2.3678966760635376, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19926925003528595, + "step": 7026 + }, + { + "epoch": 0.43925, + "grad_norm": 2.328125, + "grad_norm_var": 0.006176503499348959, + "learning_rate": 0.0001, + "loss": 7.2079, + "loss/crossentropy": 2.137107729911804, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2178208827972412, + "step": 7028 + }, + { + "epoch": 0.439375, + "grad_norm": 1.953125, + "grad_norm_var": 0.008664703369140625, + "learning_rate": 0.0001, + "loss": 7.1307, + "loss/crossentropy": 2.3075451850891113, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21288493275642395, + "step": 7030 + }, + { + "epoch": 0.4395, + "grad_norm": 2.296875, + "grad_norm_var": 0.010509999593098958, + "learning_rate": 0.0001, + "loss": 7.1834, + "loss/crossentropy": 2.046548902988434, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.18741850554943085, + "step": 7032 + }, + { + "epoch": 0.439625, + "grad_norm": 2.1875, + "grad_norm_var": 0.010379791259765625, + "learning_rate": 0.0001, + "loss": 7.1691, + "loss/crossentropy": 2.286561965942383, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20791344344615936, + "step": 7034 + }, + { + "epoch": 0.43975, + "grad_norm": 2.234375, + "grad_norm_var": 0.010599517822265625, + "learning_rate": 0.0001, + "loss": 7.3184, + "loss/crossentropy": 2.287408947944641, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21978344023227692, + "step": 7036 + }, + { + "epoch": 0.439875, + "grad_norm": 2.25, + "grad_norm_var": 0.010916900634765626, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.6174110174179077, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20811716467142105, + "step": 7038 + }, + { + "epoch": 0.44, + "grad_norm": 2.171875, + "grad_norm_var": 0.010231272379557291, + "learning_rate": 0.0001, + "loss": 7.3164, + "loss/crossentropy": 2.4823784828186035, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.23784971982240677, + "step": 7040 + }, + { + "epoch": 0.440125, + "grad_norm": 2.078125, + "grad_norm_var": 0.007958984375, + "learning_rate": 0.0001, + "loss": 7.1676, + "loss/crossentropy": 2.3906302452087402, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.201579749584198, + "step": 7042 + }, + { + "epoch": 0.44025, + "grad_norm": 1.984375, + "grad_norm_var": 0.01168212890625, + "learning_rate": 0.0001, + "loss": 6.9151, + "loss/crossentropy": 2.106368660926819, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.17003612965345383, + "step": 7044 + }, + { + "epoch": 0.440375, + "grad_norm": 2.578125, + "grad_norm_var": 0.0222076416015625, + "learning_rate": 0.0001, + "loss": 7.172, + "loss/crossentropy": 2.2014052867889404, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21450527757406235, + "step": 7046 + }, + { + "epoch": 0.4405, + "grad_norm": 2.328125, + "grad_norm_var": 0.023249308268229168, + "learning_rate": 0.0001, + "loss": 7.0817, + "loss/crossentropy": 2.2320194244384766, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2144247591495514, + "step": 7048 + }, + { + "epoch": 0.440625, + "grad_norm": 2.28125, + "grad_norm_var": 0.023615519205729168, + "learning_rate": 0.0001, + "loss": 7.1826, + "loss/crossentropy": 2.2494088411331177, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.1985793113708496, + "step": 7050 + }, + { + "epoch": 0.44075, + "grad_norm": 2.046875, + "grad_norm_var": 0.026253255208333333, + "learning_rate": 0.0001, + "loss": 7.1503, + "loss/crossentropy": 2.260366201400757, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2241573929786682, + "step": 7052 + }, + { + "epoch": 0.440875, + "grad_norm": 2.09375, + "grad_norm_var": 0.026838175455729165, + "learning_rate": 0.0001, + "loss": 7.0095, + "loss/crossentropy": 1.7234525084495544, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.1722937896847725, + "step": 7054 + }, + { + "epoch": 0.441, + "grad_norm": 2.140625, + "grad_norm_var": 0.02974853515625, + "learning_rate": 0.0001, + "loss": 7.1572, + "loss/crossentropy": 2.1869869232177734, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.1890271008014679, + "step": 7056 + }, + { + "epoch": 0.441125, + "grad_norm": 2.171875, + "grad_norm_var": 0.0292633056640625, + "learning_rate": 0.0001, + "loss": 6.9738, + "loss/crossentropy": 2.192438244819641, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21589791029691696, + "step": 7058 + }, + { + "epoch": 0.44125, + "grad_norm": 2.140625, + "grad_norm_var": 0.02275390625, + "learning_rate": 0.0001, + "loss": 7.2958, + "loss/crossentropy": 2.5864726305007935, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23851511627435684, + "step": 7060 + }, + { + "epoch": 0.441375, + "grad_norm": 2.328125, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 7.2065, + "loss/crossentropy": 2.293688654899597, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21563249826431274, + "step": 7062 + }, + { + "epoch": 0.4415, + "grad_norm": 2.359375, + "grad_norm_var": 0.011442057291666667, + "learning_rate": 0.0001, + "loss": 7.1256, + "loss/crossentropy": 2.4867671728134155, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21973519027233124, + "step": 7064 + }, + { + "epoch": 0.441625, + "grad_norm": 2.03125, + "grad_norm_var": 0.010863240559895833, + "learning_rate": 0.0001, + "loss": 7.2803, + "loss/crossentropy": 2.2372056245803833, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21072745323181152, + "step": 7066 + }, + { + "epoch": 0.44175, + "grad_norm": 2.109375, + "grad_norm_var": 0.011714680989583334, + "learning_rate": 0.0001, + "loss": 7.0581, + "loss/crossentropy": 2.25705885887146, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21308179199695587, + "step": 7068 + }, + { + "epoch": 0.441875, + "grad_norm": 2.0625, + "grad_norm_var": 0.0117584228515625, + "learning_rate": 0.0001, + "loss": 7.0393, + "loss/crossentropy": 2.0396016240119934, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.1958983615040779, + "step": 7070 + }, + { + "epoch": 0.442, + "grad_norm": 1.984375, + "grad_norm_var": 0.012010701497395833, + "learning_rate": 0.0001, + "loss": 7.1502, + "loss/crossentropy": 2.4618523120880127, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21056653559207916, + "step": 7072 + }, + { + "epoch": 0.442125, + "grad_norm": 2.28125, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 7.2114, + "loss/crossentropy": 2.572341799736023, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.24748806655406952, + "step": 7074 + }, + { + "epoch": 0.44225, + "grad_norm": 2.0625, + "grad_norm_var": 0.0137603759765625, + "learning_rate": 0.0001, + "loss": 7.0992, + "loss/crossentropy": 2.293982744216919, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20247779786586761, + "step": 7076 + }, + { + "epoch": 0.442375, + "grad_norm": 2.25, + "grad_norm_var": 0.012809244791666667, + "learning_rate": 0.0001, + "loss": 7.0991, + "loss/crossentropy": 2.3547102212905884, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21820224821567535, + "step": 7078 + }, + { + "epoch": 0.4425, + "grad_norm": 2.09375, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 7.0733, + "loss/crossentropy": 2.233034372329712, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20219457149505615, + "step": 7080 + }, + { + "epoch": 0.442625, + "grad_norm": 2.203125, + "grad_norm_var": 0.009273274739583334, + "learning_rate": 0.0001, + "loss": 7.1626, + "loss/crossentropy": 2.2339953184127808, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19660235941410065, + "step": 7082 + }, + { + "epoch": 0.44275, + "grad_norm": 2.125, + "grad_norm_var": 0.008128865559895834, + "learning_rate": 0.0001, + "loss": 7.0955, + "loss/crossentropy": 2.2933523654937744, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20504215359687805, + "step": 7084 + }, + { + "epoch": 0.442875, + "grad_norm": 2.09375, + "grad_norm_var": 0.008138020833333334, + "learning_rate": 0.0001, + "loss": 7.1335, + "loss/crossentropy": 2.067702531814575, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.20143579691648483, + "step": 7086 + }, + { + "epoch": 0.443, + "grad_norm": 1.96875, + "grad_norm_var": 0.009570058186848958, + "learning_rate": 0.0001, + "loss": 7.1793, + "loss/crossentropy": 2.0924493074417114, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.19646278768777847, + "step": 7088 + }, + { + "epoch": 0.443125, + "grad_norm": 2.3125, + "grad_norm_var": 0.010235341389973958, + "learning_rate": 0.0001, + "loss": 7.0845, + "loss/crossentropy": 2.3074241876602173, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.20258978009223938, + "step": 7090 + }, + { + "epoch": 0.44325, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013842519124348958, + "learning_rate": 0.0001, + "loss": 7.065, + "loss/crossentropy": 2.421837568283081, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21192050725221634, + "step": 7092 + }, + { + "epoch": 0.443375, + "grad_norm": 2.40625, + "grad_norm_var": 0.017911529541015624, + "learning_rate": 0.0001, + "loss": 7.2564, + "loss/crossentropy": 2.22128963470459, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21447572112083435, + "step": 7094 + }, + { + "epoch": 0.4435, + "grad_norm": 2.015625, + "grad_norm_var": 0.019449869791666668, + "learning_rate": 0.0001, + "loss": 7.0811, + "loss/crossentropy": 1.8576909899711609, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.19812503457069397, + "step": 7096 + }, + { + "epoch": 0.443625, + "grad_norm": 2.109375, + "grad_norm_var": 0.01923828125, + "learning_rate": 0.0001, + "loss": 7.1299, + "loss/crossentropy": 2.3352116346359253, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.22965504229068756, + "step": 7098 + }, + { + "epoch": 0.44375, + "grad_norm": 2.21875, + "grad_norm_var": 0.021922810872395834, + "learning_rate": 0.0001, + "loss": 7.1568, + "loss/crossentropy": 2.288171410560608, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22362970560789108, + "step": 7100 + }, + { + "epoch": 0.443875, + "grad_norm": 2.0625, + "grad_norm_var": 0.022798665364583335, + "learning_rate": 0.0001, + "loss": 7.2212, + "loss/crossentropy": 2.535971522331238, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21269136667251587, + "step": 7102 + }, + { + "epoch": 0.444, + "grad_norm": 2.078125, + "grad_norm_var": 0.01968968709309896, + "learning_rate": 0.0001, + "loss": 7.1576, + "loss/crossentropy": 2.1160417795181274, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20902036875486374, + "step": 7104 + }, + { + "epoch": 0.444125, + "grad_norm": 2.078125, + "grad_norm_var": 0.01985448201497396, + "learning_rate": 0.0001, + "loss": 7.1795, + "loss/crossentropy": 2.206043004989624, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.20614183694124222, + "step": 7106 + }, + { + "epoch": 0.44425, + "grad_norm": 2.109375, + "grad_norm_var": 0.015641021728515624, + "learning_rate": 0.0001, + "loss": 7.0607, + "loss/crossentropy": 2.3254839181900024, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21963687986135483, + "step": 7108 + }, + { + "epoch": 0.444375, + "grad_norm": 2.15625, + "grad_norm_var": 0.010493723551432292, + "learning_rate": 0.0001, + "loss": 7.122, + "loss/crossentropy": 2.003136456012726, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20006847381591797, + "step": 7110 + }, + { + "epoch": 0.4445, + "grad_norm": 2.15625, + "grad_norm_var": 0.0087310791015625, + "learning_rate": 0.0001, + "loss": 7.242, + "loss/crossentropy": 1.9243924021720886, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20003072917461395, + "step": 7112 + }, + { + "epoch": 0.444625, + "grad_norm": 2.296875, + "grad_norm_var": 0.010383097330729167, + "learning_rate": 0.0001, + "loss": 7.0951, + "loss/crossentropy": 2.0053927302360535, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20178984850645065, + "step": 7114 + }, + { + "epoch": 0.44475, + "grad_norm": 2.203125, + "grad_norm_var": 0.0123931884765625, + "learning_rate": 0.0001, + "loss": 7.2092, + "loss/crossentropy": 2.1276594400405884, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19504177570343018, + "step": 7116 + }, + { + "epoch": 0.444875, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013303375244140625, + "learning_rate": 0.0001, + "loss": 7.1927, + "loss/crossentropy": 2.3429445028305054, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.22777916491031647, + "step": 7118 + }, + { + "epoch": 0.445, + "grad_norm": 2.546875, + "grad_norm_var": 0.022885894775390624, + "learning_rate": 0.0001, + "loss": 7.0434, + "loss/crossentropy": 1.9249637126922607, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.196383997797966, + "step": 7120 + }, + { + "epoch": 0.445125, + "grad_norm": 2.15625, + "grad_norm_var": 0.025658925374348957, + "learning_rate": 0.0001, + "loss": 7.0644, + "loss/crossentropy": 2.101871132850647, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19870474934577942, + "step": 7122 + }, + { + "epoch": 0.44525, + "grad_norm": 2.265625, + "grad_norm_var": 0.02504450480143229, + "learning_rate": 0.0001, + "loss": 7.1175, + "loss/crossentropy": 2.2008549571037292, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.19907881319522858, + "step": 7124 + }, + { + "epoch": 0.445375, + "grad_norm": 2.0625, + "grad_norm_var": 0.026838938395182293, + "learning_rate": 0.0001, + "loss": 7.2085, + "loss/crossentropy": 1.9554465413093567, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20027147978544235, + "step": 7126 + }, + { + "epoch": 0.4455, + "grad_norm": 2.203125, + "grad_norm_var": 0.026747385660807293, + "learning_rate": 0.0001, + "loss": 7.3277, + "loss/crossentropy": 2.2794995307922363, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21968071907758713, + "step": 7128 + }, + { + "epoch": 0.445625, + "grad_norm": 2.171875, + "grad_norm_var": 0.02599054972330729, + "learning_rate": 0.0001, + "loss": 7.1629, + "loss/crossentropy": 2.1739684343338013, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.20093033462762833, + "step": 7130 + }, + { + "epoch": 0.44575, + "grad_norm": 2.109375, + "grad_norm_var": 0.02295710245768229, + "learning_rate": 0.0001, + "loss": 7.137, + "loss/crossentropy": 2.3477087020874023, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2029757872223854, + "step": 7132 + }, + { + "epoch": 0.445875, + "grad_norm": 2.25, + "grad_norm_var": 0.019429524739583332, + "learning_rate": 0.0001, + "loss": 7.2484, + "loss/crossentropy": 2.5823616981506348, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22345758229494095, + "step": 7134 + }, + { + "epoch": 0.446, + "grad_norm": 2.125, + "grad_norm_var": 0.009859212239583333, + "learning_rate": 0.0001, + "loss": 7.0862, + "loss/crossentropy": 2.093753218650818, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20466412603855133, + "step": 7136 + }, + { + "epoch": 0.446125, + "grad_norm": 2.125, + "grad_norm_var": 0.004231770833333333, + "learning_rate": 0.0001, + "loss": 7.0262, + "loss/crossentropy": 2.1759002804756165, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22160179913043976, + "step": 7138 + }, + { + "epoch": 0.44625, + "grad_norm": 2.125, + "grad_norm_var": 0.007342274983723958, + "learning_rate": 0.0001, + "loss": 7.0974, + "loss/crossentropy": 2.324357032775879, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2030269131064415, + "step": 7140 + }, + { + "epoch": 0.446375, + "grad_norm": 2.25, + "grad_norm_var": 0.008015696207682292, + "learning_rate": 0.0001, + "loss": 7.2816, + "loss/crossentropy": 2.4500794410705566, + "loss/hidden": 2.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.260087326169014, + "step": 7142 + }, + { + "epoch": 0.4465, + "grad_norm": 2.046875, + "grad_norm_var": 0.008392079671223959, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.3314210176467896, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.1982848197221756, + "step": 7144 + }, + { + "epoch": 0.446625, + "grad_norm": 2.109375, + "grad_norm_var": 0.008001454671223958, + "learning_rate": 0.0001, + "loss": 7.2175, + "loss/crossentropy": 2.3275386095046997, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20359043031930923, + "step": 7146 + }, + { + "epoch": 0.44675, + "grad_norm": 2.375, + "grad_norm_var": 0.011230214436848959, + "learning_rate": 0.0001, + "loss": 7.2505, + "loss/crossentropy": 2.1986998319625854, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2198983132839203, + "step": 7148 + }, + { + "epoch": 0.446875, + "grad_norm": 2.109375, + "grad_norm_var": 0.012115224202473959, + "learning_rate": 0.0001, + "loss": 7.2739, + "loss/crossentropy": 1.9064122438430786, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.1959618628025055, + "step": 7150 + }, + { + "epoch": 0.447, + "grad_norm": 2.109375, + "grad_norm_var": 0.013669586181640625, + "learning_rate": 0.0001, + "loss": 7.1101, + "loss/crossentropy": 2.3088358640670776, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.20778021216392517, + "step": 7152 + }, + { + "epoch": 0.447125, + "grad_norm": 2.171875, + "grad_norm_var": 0.014249420166015625, + "learning_rate": 0.0001, + "loss": 7.2034, + "loss/crossentropy": 2.2749520540237427, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.19885848462581635, + "step": 7154 + }, + { + "epoch": 0.44725, + "grad_norm": 1.984375, + "grad_norm_var": 0.011356608072916666, + "learning_rate": 0.0001, + "loss": 6.9199, + "loss/crossentropy": 2.0392338633537292, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.18347318470478058, + "step": 7156 + }, + { + "epoch": 0.447375, + "grad_norm": 2.203125, + "grad_norm_var": 0.012626139322916667, + "learning_rate": 0.0001, + "loss": 7.198, + "loss/crossentropy": 2.3574529886245728, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21006865054368973, + "step": 7158 + }, + { + "epoch": 0.4475, + "grad_norm": 2.375, + "grad_norm_var": 0.021922810872395834, + "learning_rate": 0.0001, + "loss": 7.1556, + "loss/crossentropy": 2.3607640266418457, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19356133043766022, + "step": 7160 + }, + { + "epoch": 0.447625, + "grad_norm": 2.09375, + "grad_norm_var": 0.023030598958333332, + "learning_rate": 0.0001, + "loss": 7.0782, + "loss/crossentropy": 2.203671097755432, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22066359221935272, + "step": 7162 + }, + { + "epoch": 0.44775, + "grad_norm": 2.09375, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 0.0001, + "loss": 7.0562, + "loss/crossentropy": 2.1417211294174194, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.1960388571023941, + "step": 7164 + }, + { + "epoch": 0.447875, + "grad_norm": 2.09375, + "grad_norm_var": 0.021630859375, + "learning_rate": 0.0001, + "loss": 7.113, + "loss/crossentropy": 2.0206031799316406, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.18700659275054932, + "step": 7166 + }, + { + "epoch": 0.448, + "grad_norm": 2.109375, + "grad_norm_var": 0.02095947265625, + "learning_rate": 0.0001, + "loss": 7.094, + "loss/crossentropy": 2.1299338340759277, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.1982402428984642, + "step": 7168 + }, + { + "epoch": 0.448125, + "grad_norm": 2.203125, + "grad_norm_var": 0.028413899739583335, + "learning_rate": 0.0001, + "loss": 7.152, + "loss/crossentropy": 2.33224093914032, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21684490144252777, + "step": 7170 + }, + { + "epoch": 0.44825, + "grad_norm": 2.296875, + "grad_norm_var": 0.025016276041666667, + "learning_rate": 0.0001, + "loss": 7.2652, + "loss/crossentropy": 2.3137996196746826, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2174111008644104, + "step": 7172 + }, + { + "epoch": 0.448375, + "grad_norm": 2.1875, + "grad_norm_var": 0.024421183268229167, + "learning_rate": 0.0001, + "loss": 7.1668, + "loss/crossentropy": 1.9115217328071594, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21205194294452667, + "step": 7174 + }, + { + "epoch": 0.4485, + "grad_norm": 2.0, + "grad_norm_var": 0.018000284830729168, + "learning_rate": 0.0001, + "loss": 7.1978, + "loss/crossentropy": 2.4868533611297607, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22938791662454605, + "step": 7176 + }, + { + "epoch": 0.448625, + "grad_norm": 2.265625, + "grad_norm_var": 0.018260701497395834, + "learning_rate": 0.0001, + "loss": 7.193, + "loss/crossentropy": 2.2773613929748535, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20718929171562195, + "step": 7178 + }, + { + "epoch": 0.44875, + "grad_norm": 2.078125, + "grad_norm_var": 0.0171295166015625, + "learning_rate": 0.0001, + "loss": 7.1092, + "loss/crossentropy": 2.048713266849518, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.19681257009506226, + "step": 7180 + }, + { + "epoch": 0.448875, + "grad_norm": 2.3125, + "grad_norm_var": 0.015397135416666667, + "learning_rate": 0.0001, + "loss": 7.0797, + "loss/crossentropy": 1.8432785868644714, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20921127498149872, + "step": 7182 + }, + { + "epoch": 0.449, + "grad_norm": 2.140625, + "grad_norm_var": 0.015387980143229167, + "learning_rate": 0.0001, + "loss": 7.1391, + "loss/crossentropy": 2.129906117916107, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2244158536195755, + "step": 7184 + }, + { + "epoch": 0.449125, + "grad_norm": 2.21875, + "grad_norm_var": 0.011551920572916667, + "learning_rate": 0.0001, + "loss": 7.1485, + "loss/crossentropy": 2.1112728118896484, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20086340606212616, + "step": 7186 + }, + { + "epoch": 0.44925, + "grad_norm": 2.0625, + "grad_norm_var": 0.011714680989583334, + "learning_rate": 0.0001, + "loss": 7.2314, + "loss/crossentropy": 2.256954550743103, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20468229055404663, + "step": 7188 + }, + { + "epoch": 0.449375, + "grad_norm": 2.125, + "grad_norm_var": 0.014453125, + "learning_rate": 0.0001, + "loss": 7.1437, + "loss/crossentropy": 2.071455657482147, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.20885341614484787, + "step": 7190 + }, + { + "epoch": 0.4495, + "grad_norm": 2.046875, + "grad_norm_var": 0.013114420572916667, + "learning_rate": 0.0001, + "loss": 7.0949, + "loss/crossentropy": 2.562646746635437, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21791629493236542, + "step": 7192 + }, + { + "epoch": 0.449625, + "grad_norm": 2.140625, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 7.1798, + "loss/crossentropy": 2.3666142225265503, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2231055274605751, + "step": 7194 + }, + { + "epoch": 0.44975, + "grad_norm": 2.109375, + "grad_norm_var": 0.009300740559895833, + "learning_rate": 0.0001, + "loss": 7.2875, + "loss/crossentropy": 2.2392258644104004, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21026718616485596, + "step": 7196 + }, + { + "epoch": 0.449875, + "grad_norm": 2.015625, + "grad_norm_var": 0.0059234619140625, + "learning_rate": 0.0001, + "loss": 7.0306, + "loss/crossentropy": 2.531536340713501, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20711271464824677, + "step": 7198 + }, + { + "epoch": 0.45, + "grad_norm": 2.453125, + "grad_norm_var": 0.045699055989583334, + "learning_rate": 0.0001, + "loss": 7.1813, + "loss/crossentropy": 2.32056200504303, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21862630546092987, + "step": 7200 + }, + { + "epoch": 0.450125, + "grad_norm": 2.296875, + "grad_norm_var": 0.04778620402018229, + "learning_rate": 0.0001, + "loss": 7.3459, + "loss/crossentropy": 2.6083868741989136, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2005249261856079, + "step": 7202 + }, + { + "epoch": 0.45025, + "grad_norm": 2.171875, + "grad_norm_var": 0.046781158447265624, + "learning_rate": 0.0001, + "loss": 6.9953, + "loss/crossentropy": 2.134474039077759, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.18911154568195343, + "step": 7204 + }, + { + "epoch": 0.450375, + "grad_norm": 2.015625, + "grad_norm_var": 0.046537017822265624, + "learning_rate": 0.0001, + "loss": 7.2096, + "loss/crossentropy": 2.2817357778549194, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20316140353679657, + "step": 7206 + }, + { + "epoch": 0.4505, + "grad_norm": 2.03125, + "grad_norm_var": 0.049806467692057294, + "learning_rate": 0.0001, + "loss": 7.0024, + "loss/crossentropy": 2.288169503211975, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20653357356786728, + "step": 7208 + }, + { + "epoch": 0.450625, + "grad_norm": 2.25, + "grad_norm_var": 0.050172678629557294, + "learning_rate": 0.0001, + "loss": 7.2516, + "loss/crossentropy": 2.5531680583953857, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21465806663036346, + "step": 7210 + }, + { + "epoch": 0.45075, + "grad_norm": 2.1875, + "grad_norm_var": 0.04960912068684896, + "learning_rate": 0.0001, + "loss": 7.3268, + "loss/crossentropy": 2.2371240854263306, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21228843927383423, + "step": 7212 + }, + { + "epoch": 0.450875, + "grad_norm": 2.21875, + "grad_norm_var": 0.04690119425455729, + "learning_rate": 0.0001, + "loss": 7.2514, + "loss/crossentropy": 2.17119038105011, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19861158728599548, + "step": 7214 + }, + { + "epoch": 0.451, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0108551025390625, + "learning_rate": 0.0001, + "loss": 6.9546, + "loss/crossentropy": 2.042115569114685, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19691552966833115, + "step": 7216 + }, + { + "epoch": 0.451125, + "grad_norm": 2.3125, + "grad_norm_var": 0.009997304280598958, + "learning_rate": 0.0001, + "loss": 7.0743, + "loss/crossentropy": 2.0181705951690674, + "loss/hidden": 2.71875, + "loss/jsd": 0.0, + "loss/logits": 0.20092586427927017, + "step": 7218 + }, + { + "epoch": 0.45125, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011661783854166666, + "learning_rate": 0.0001, + "loss": 7.0426, + "loss/crossentropy": 2.313483238220215, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2026253342628479, + "step": 7220 + }, + { + "epoch": 0.451375, + "grad_norm": 2.3125, + "grad_norm_var": 0.014689127604166666, + "learning_rate": 0.0001, + "loss": 7.0506, + "loss/crossentropy": 2.278806447982788, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2161618396639824, + "step": 7222 + }, + { + "epoch": 0.4515, + "grad_norm": 2.140625, + "grad_norm_var": 0.011592610677083334, + "learning_rate": 0.0001, + "loss": 7.3406, + "loss/crossentropy": 2.26158607006073, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20861487835645676, + "step": 7224 + }, + { + "epoch": 0.451625, + "grad_norm": 2.203125, + "grad_norm_var": 0.011201985677083333, + "learning_rate": 0.0001, + "loss": 7.1772, + "loss/crossentropy": 2.0447089076042175, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2081030160188675, + "step": 7226 + }, + { + "epoch": 0.45175, + "grad_norm": 2.109375, + "grad_norm_var": 0.012308756510416666, + "learning_rate": 0.0001, + "loss": 7.0837, + "loss/crossentropy": 2.4593422412872314, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.22033347189426422, + "step": 7228 + }, + { + "epoch": 0.451875, + "grad_norm": 2.1875, + "grad_norm_var": 0.012352498372395833, + "learning_rate": 0.0001, + "loss": 7.0803, + "loss/crossentropy": 2.1379048824310303, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.212110698223114, + "step": 7230 + }, + { + "epoch": 0.452, + "grad_norm": 2.078125, + "grad_norm_var": 0.009631093343098958, + "learning_rate": 0.0001, + "loss": 7.0067, + "loss/crossentropy": 2.147655963897705, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21026761829853058, + "step": 7232 + }, + { + "epoch": 0.452125, + "grad_norm": 2.09375, + "grad_norm_var": 0.008278147379557291, + "learning_rate": 0.0001, + "loss": 7.2995, + "loss/crossentropy": 2.4965137243270874, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2102838009595871, + "step": 7234 + }, + { + "epoch": 0.45225, + "grad_norm": 2.34375, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 7.2283, + "loss/crossentropy": 2.5284390449523926, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2260439619421959, + "step": 7236 + }, + { + "epoch": 0.452375, + "grad_norm": 2.046875, + "grad_norm_var": 0.006136067708333333, + "learning_rate": 0.0001, + "loss": 7.1149, + "loss/crossentropy": 2.124226689338684, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.1935354694724083, + "step": 7238 + }, + { + "epoch": 0.4525, + "grad_norm": 2.046875, + "grad_norm_var": 0.008177693684895833, + "learning_rate": 0.0001, + "loss": 7.1761, + "loss/crossentropy": 2.31568443775177, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19555098563432693, + "step": 7240 + }, + { + "epoch": 0.452625, + "grad_norm": 2.125, + "grad_norm_var": 0.00797119140625, + "learning_rate": 0.0001, + "loss": 7.0197, + "loss/crossentropy": 2.2378604412078857, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2003757283091545, + "step": 7242 + }, + { + "epoch": 0.45275, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008304595947265625, + "learning_rate": 0.0001, + "loss": 7.032, + "loss/crossentropy": 2.0563949942588806, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.1994936764240265, + "step": 7244 + }, + { + "epoch": 0.452875, + "grad_norm": 2.1875, + "grad_norm_var": 0.009273020426432292, + "learning_rate": 0.0001, + "loss": 7.0266, + "loss/crossentropy": 2.1362143754959106, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19745449721813202, + "step": 7246 + }, + { + "epoch": 0.453, + "grad_norm": 2.203125, + "grad_norm_var": 0.009978993733723959, + "learning_rate": 0.0001, + "loss": 7.054, + "loss/crossentropy": 2.285860776901245, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21970739215612411, + "step": 7248 + }, + { + "epoch": 0.453125, + "grad_norm": 2.015625, + "grad_norm_var": 0.010746002197265625, + "learning_rate": 0.0001, + "loss": 7.1889, + "loss/crossentropy": 2.350265145301819, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2033230885863304, + "step": 7250 + }, + { + "epoch": 0.45325, + "grad_norm": 2.234375, + "grad_norm_var": 0.006768544514973958, + "learning_rate": 0.0001, + "loss": 7.212, + "loss/crossentropy": 2.2641106843948364, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20904739946126938, + "step": 7252 + }, + { + "epoch": 0.453375, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0104248046875, + "learning_rate": 0.0001, + "loss": 7.2615, + "loss/crossentropy": 2.3802077770233154, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21182265132665634, + "step": 7254 + }, + { + "epoch": 0.4535, + "grad_norm": 2.34375, + "grad_norm_var": 0.013841756184895833, + "learning_rate": 0.0001, + "loss": 7.0786, + "loss/crossentropy": 2.2918620109558105, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20877155661582947, + "step": 7256 + }, + { + "epoch": 0.453625, + "grad_norm": 2.140625, + "grad_norm_var": 0.0139312744140625, + "learning_rate": 0.0001, + "loss": 7.0746, + "loss/crossentropy": 2.120309591293335, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19328848272562027, + "step": 7258 + }, + { + "epoch": 0.45375, + "grad_norm": 2.125, + "grad_norm_var": 0.011901601155598959, + "learning_rate": 0.0001, + "loss": 7.2181, + "loss/crossentropy": 2.221208691596985, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21751584112644196, + "step": 7260 + }, + { + "epoch": 0.453875, + "grad_norm": 2.421875, + "grad_norm_var": 0.023978424072265626, + "learning_rate": 0.0001, + "loss": 7.1222, + "loss/crossentropy": 2.4885802268981934, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2257223054766655, + "step": 7262 + }, + { + "epoch": 0.454, + "grad_norm": 2.375, + "grad_norm_var": 0.02561213175455729, + "learning_rate": 0.0001, + "loss": 7.1315, + "loss/crossentropy": 2.1699594259262085, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.21807490289211273, + "step": 7264 + }, + { + "epoch": 0.454125, + "grad_norm": 2.21875, + "grad_norm_var": 0.022607167561848957, + "learning_rate": 0.0001, + "loss": 7.1284, + "loss/crossentropy": 1.8780632615089417, + "loss/hidden": 2.71875, + "loss/jsd": 0.0, + "loss/logits": 0.1910051330924034, + "step": 7266 + }, + { + "epoch": 0.45425, + "grad_norm": 2.125, + "grad_norm_var": 0.02069066365559896, + "learning_rate": 0.0001, + "loss": 7.1184, + "loss/crossentropy": 2.246003210544586, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22277537733316422, + "step": 7268 + }, + { + "epoch": 0.454375, + "grad_norm": 2.375, + "grad_norm_var": 0.017096964518229167, + "learning_rate": 0.0001, + "loss": 7.1533, + "loss/crossentropy": 2.1062055826187134, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.22463850677013397, + "step": 7270 + }, + { + "epoch": 0.4545, + "grad_norm": 2.25, + "grad_norm_var": 0.015620930989583334, + "learning_rate": 0.0001, + "loss": 7.1315, + "loss/crossentropy": 2.35440993309021, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2115459069609642, + "step": 7272 + }, + { + "epoch": 0.454625, + "grad_norm": 2.03125, + "grad_norm_var": 0.01744384765625, + "learning_rate": 0.0001, + "loss": 7.0647, + "loss/crossentropy": 2.358685255050659, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20285096019506454, + "step": 7274 + }, + { + "epoch": 0.45475, + "grad_norm": 2.015625, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 7.0577, + "loss/crossentropy": 2.1627367734909058, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19727136939764023, + "step": 7276 + }, + { + "epoch": 0.454875, + "grad_norm": 2.0625, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 7.0459, + "loss/crossentropy": 2.0946096181869507, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19678431004285812, + "step": 7278 + }, + { + "epoch": 0.455, + "grad_norm": 2.1875, + "grad_norm_var": 0.0161773681640625, + "learning_rate": 0.0001, + "loss": 7.2589, + "loss/crossentropy": 2.244626998901367, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.22719382494688034, + "step": 7280 + }, + { + "epoch": 0.455125, + "grad_norm": 2.296875, + "grad_norm_var": 0.014632161458333333, + "learning_rate": 0.0001, + "loss": 7.3766, + "loss/crossentropy": 2.2402459383010864, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20855902880430222, + "step": 7282 + }, + { + "epoch": 0.45525, + "grad_norm": 2.1875, + "grad_norm_var": 0.016844685872395834, + "learning_rate": 0.0001, + "loss": 7.1058, + "loss/crossentropy": 2.4255205392837524, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1950306072831154, + "step": 7284 + }, + { + "epoch": 0.455375, + "grad_norm": 2.109375, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 7.1902, + "loss/crossentropy": 2.4710735082626343, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21613246202468872, + "step": 7286 + }, + { + "epoch": 0.4555, + "grad_norm": 2.203125, + "grad_norm_var": 0.01373291015625, + "learning_rate": 0.0001, + "loss": 7.0767, + "loss/crossentropy": 2.0865633487701416, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19315458834171295, + "step": 7288 + }, + { + "epoch": 0.455625, + "grad_norm": 2.046875, + "grad_norm_var": 0.014232381184895834, + "learning_rate": 0.0001, + "loss": 7.1555, + "loss/crossentropy": 2.431704044342041, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.22103539109230042, + "step": 7290 + }, + { + "epoch": 0.45575, + "grad_norm": 2.109375, + "grad_norm_var": 0.013212076822916667, + "learning_rate": 0.0001, + "loss": 7.1897, + "loss/crossentropy": 2.2944241762161255, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20230845361948013, + "step": 7292 + }, + { + "epoch": 0.455875, + "grad_norm": 2.125, + "grad_norm_var": 0.012744140625, + "learning_rate": 0.0001, + "loss": 7.2919, + "loss/crossentropy": 2.1923948526382446, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2351241260766983, + "step": 7294 + }, + { + "epoch": 0.456, + "grad_norm": 2.21875, + "grad_norm_var": 0.013498687744140625, + "learning_rate": 0.0001, + "loss": 7.0956, + "loss/crossentropy": 1.9904609322547913, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19044706225395203, + "step": 7296 + }, + { + "epoch": 0.456125, + "grad_norm": 2.125, + "grad_norm_var": 0.016371409098307293, + "learning_rate": 0.0001, + "loss": 7.2638, + "loss/crossentropy": 2.392876386642456, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21254249662160873, + "step": 7298 + }, + { + "epoch": 0.45625, + "grad_norm": 2.03125, + "grad_norm_var": 0.01687800089518229, + "learning_rate": 0.0001, + "loss": 7.0909, + "loss/crossentropy": 1.9285590052604675, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.18553440272808075, + "step": 7300 + }, + { + "epoch": 0.456375, + "grad_norm": 2.09375, + "grad_norm_var": 0.016717274983723957, + "learning_rate": 0.0001, + "loss": 7.1137, + "loss/crossentropy": 2.4444403648376465, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.22047097980976105, + "step": 7302 + }, + { + "epoch": 0.4565, + "grad_norm": 2.015625, + "grad_norm_var": 0.01741511027018229, + "learning_rate": 0.0001, + "loss": 7.0704, + "loss/crossentropy": 2.4315890073776245, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21392318606376648, + "step": 7304 + }, + { + "epoch": 0.456625, + "grad_norm": 2.140625, + "grad_norm_var": 0.013889312744140625, + "learning_rate": 0.0001, + "loss": 7.1225, + "loss/crossentropy": 2.1737417578697205, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.21062488108873367, + "step": 7306 + }, + { + "epoch": 0.45675, + "grad_norm": 2.078125, + "grad_norm_var": 0.015036773681640626, + "learning_rate": 0.0001, + "loss": 7.0151, + "loss/crossentropy": 1.8432039022445679, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20271048694849014, + "step": 7308 + }, + { + "epoch": 0.456875, + "grad_norm": 2.171875, + "grad_norm_var": 0.014113108317057291, + "learning_rate": 0.0001, + "loss": 7.0773, + "loss/crossentropy": 2.1769769191741943, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20057760924100876, + "step": 7310 + }, + { + "epoch": 0.457, + "grad_norm": 2.453125, + "grad_norm_var": 0.017267862955729168, + "learning_rate": 0.0001, + "loss": 7.2829, + "loss/crossentropy": 2.502546191215515, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21403612941503525, + "step": 7312 + }, + { + "epoch": 0.457125, + "grad_norm": 2.09375, + "grad_norm_var": 0.012547810872395834, + "learning_rate": 0.0001, + "loss": 7.1304, + "loss/crossentropy": 2.3192015886306763, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21322068572044373, + "step": 7314 + }, + { + "epoch": 0.45725, + "grad_norm": 2.140625, + "grad_norm_var": 0.0109771728515625, + "learning_rate": 0.0001, + "loss": 7.0573, + "loss/crossentropy": 2.21256947517395, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19907121360301971, + "step": 7316 + }, + { + "epoch": 0.457375, + "grad_norm": 2.078125, + "grad_norm_var": 0.011083984375, + "learning_rate": 0.0001, + "loss": 7.1052, + "loss/crossentropy": 2.3876004219055176, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21536500751972198, + "step": 7318 + }, + { + "epoch": 0.4575, + "grad_norm": 2.21875, + "grad_norm_var": 0.0110748291015625, + "learning_rate": 0.0001, + "loss": 7.1542, + "loss/crossentropy": 2.3090778589248657, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2097294181585312, + "step": 7320 + }, + { + "epoch": 0.457625, + "grad_norm": 2.140625, + "grad_norm_var": 0.011226399739583334, + "learning_rate": 0.0001, + "loss": 7.1163, + "loss/crossentropy": 2.3780629634857178, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2092108651995659, + "step": 7322 + }, + { + "epoch": 0.45775, + "grad_norm": 2.09375, + "grad_norm_var": 0.009137980143229167, + "learning_rate": 0.0001, + "loss": 7.1624, + "loss/crossentropy": 2.3224358558654785, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20188771933317184, + "step": 7324 + }, + { + "epoch": 0.457875, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014707183837890625, + "learning_rate": 0.0001, + "loss": 7.1187, + "loss/crossentropy": 2.044081926345825, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.18789289891719818, + "step": 7326 + }, + { + "epoch": 0.458, + "grad_norm": 2.375, + "grad_norm_var": 0.012182362874348958, + "learning_rate": 0.0001, + "loss": 7.1447, + "loss/crossentropy": 2.380860447883606, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20718378573656082, + "step": 7328 + }, + { + "epoch": 0.458125, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0149658203125, + "learning_rate": 0.0001, + "loss": 7.2424, + "loss/crossentropy": 2.1657201051712036, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19616171717643738, + "step": 7330 + }, + { + "epoch": 0.45825, + "grad_norm": 2.078125, + "grad_norm_var": 0.017341105143229167, + "learning_rate": 0.0001, + "loss": 6.9994, + "loss/crossentropy": 2.148615837097168, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20785433053970337, + "step": 7332 + }, + { + "epoch": 0.458375, + "grad_norm": 2.28125, + "grad_norm_var": 0.018513997395833332, + "learning_rate": 0.0001, + "loss": 7.1808, + "loss/crossentropy": 2.2177847623825073, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21924154460430145, + "step": 7334 + }, + { + "epoch": 0.4585, + "grad_norm": 2.046875, + "grad_norm_var": 0.020511881510416666, + "learning_rate": 0.0001, + "loss": 7.1623, + "loss/crossentropy": 2.2308239936828613, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2200755625963211, + "step": 7336 + }, + { + "epoch": 0.458625, + "grad_norm": 2.140625, + "grad_norm_var": 0.020857747395833334, + "learning_rate": 0.0001, + "loss": 7.1733, + "loss/crossentropy": 1.9752941727638245, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19304338842630386, + "step": 7338 + }, + { + "epoch": 0.45875, + "grad_norm": 2.34375, + "grad_norm_var": 0.027058919270833332, + "learning_rate": 0.0001, + "loss": 7.3887, + "loss/crossentropy": 2.253862977027893, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21588093787431717, + "step": 7340 + }, + { + "epoch": 0.458875, + "grad_norm": 2.15625, + "grad_norm_var": 0.022045644124348958, + "learning_rate": 0.0001, + "loss": 7.1725, + "loss/crossentropy": 2.3387417793273926, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20970949530601501, + "step": 7342 + }, + { + "epoch": 0.459, + "grad_norm": 2.109375, + "grad_norm_var": 0.024379221598307292, + "learning_rate": 0.0001, + "loss": 7.222, + "loss/crossentropy": 2.1245256066322327, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.22371243685483932, + "step": 7344 + }, + { + "epoch": 0.459125, + "grad_norm": 2.328125, + "grad_norm_var": 0.023216756184895833, + "learning_rate": 0.0001, + "loss": 7.257, + "loss/crossentropy": 2.2408164739608765, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2288973107933998, + "step": 7346 + }, + { + "epoch": 0.45925, + "grad_norm": 1.8515625, + "grad_norm_var": 0.02879613240559896, + "learning_rate": 0.0001, + "loss": 7.0951, + "loss/crossentropy": 2.180490016937256, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21135683357715607, + "step": 7348 + }, + { + "epoch": 0.459375, + "grad_norm": 2.203125, + "grad_norm_var": 0.02787043253580729, + "learning_rate": 0.0001, + "loss": 7.1622, + "loss/crossentropy": 2.359486937522888, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21552064269781113, + "step": 7350 + }, + { + "epoch": 0.4595, + "grad_norm": 2.25, + "grad_norm_var": 0.02600886027018229, + "learning_rate": 0.0001, + "loss": 7.2729, + "loss/crossentropy": 2.4955438375473022, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2092251032590866, + "step": 7352 + }, + { + "epoch": 0.459625, + "grad_norm": 2.03125, + "grad_norm_var": 0.026444244384765624, + "learning_rate": 0.0001, + "loss": 7.1748, + "loss/crossentropy": 2.3510589599609375, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20734377205371857, + "step": 7354 + }, + { + "epoch": 0.45975, + "grad_norm": 2.203125, + "grad_norm_var": 0.018332672119140626, + "learning_rate": 0.0001, + "loss": 7.1075, + "loss/crossentropy": 2.3306682109832764, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20482201874256134, + "step": 7356 + }, + { + "epoch": 0.459875, + "grad_norm": 1.984375, + "grad_norm_var": 0.01725031534830729, + "learning_rate": 0.0001, + "loss": 7.1674, + "loss/crossentropy": 2.3967502117156982, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21556542068719864, + "step": 7358 + }, + { + "epoch": 0.46, + "grad_norm": 2.265625, + "grad_norm_var": 0.017073313395182293, + "learning_rate": 0.0001, + "loss": 7.0717, + "loss/crossentropy": 2.274154245853424, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21294216066598892, + "step": 7360 + }, + { + "epoch": 0.460125, + "grad_norm": 2.140625, + "grad_norm_var": 0.013783518473307292, + "learning_rate": 0.0001, + "loss": 7.2142, + "loss/crossentropy": 2.230677008628845, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2010795623064041, + "step": 7362 + }, + { + "epoch": 0.46025, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009504954020182291, + "learning_rate": 0.0001, + "loss": 7.0379, + "loss/crossentropy": 2.353756308555603, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22095441818237305, + "step": 7364 + }, + { + "epoch": 0.460375, + "grad_norm": 2.140625, + "grad_norm_var": 0.009458160400390625, + "learning_rate": 0.0001, + "loss": 6.9673, + "loss/crossentropy": 2.085910201072693, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.18464379012584686, + "step": 7366 + }, + { + "epoch": 0.4605, + "grad_norm": 1.984375, + "grad_norm_var": 0.009199778238932291, + "learning_rate": 0.0001, + "loss": 6.8746, + "loss/crossentropy": 2.0555408000946045, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2172028347849846, + "step": 7368 + }, + { + "epoch": 0.460625, + "grad_norm": 2.15625, + "grad_norm_var": 0.009907786051432292, + "learning_rate": 0.0001, + "loss": 7.0269, + "loss/crossentropy": 2.3041187524795532, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.21645274013280869, + "step": 7370 + }, + { + "epoch": 0.46075, + "grad_norm": 2.015625, + "grad_norm_var": 0.009531402587890625, + "learning_rate": 0.0001, + "loss": 6.9228, + "loss/crossentropy": 1.9540690183639526, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.184081070125103, + "step": 7372 + }, + { + "epoch": 0.460875, + "grad_norm": 2.109375, + "grad_norm_var": 0.009034983317057292, + "learning_rate": 0.0001, + "loss": 7.1386, + "loss/crossentropy": 2.4415656328201294, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20297012478113174, + "step": 7374 + }, + { + "epoch": 0.461, + "grad_norm": 2.078125, + "grad_norm_var": 0.006528472900390625, + "learning_rate": 0.0001, + "loss": 7.1107, + "loss/crossentropy": 2.086025834083557, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19494184106588364, + "step": 7376 + }, + { + "epoch": 0.461125, + "grad_norm": 2.109375, + "grad_norm_var": 0.0063860575358072914, + "learning_rate": 0.0001, + "loss": 7.1613, + "loss/crossentropy": 2.236558198928833, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21862926334142685, + "step": 7378 + }, + { + "epoch": 0.46125, + "grad_norm": 2.109375, + "grad_norm_var": 0.006243642171223958, + "learning_rate": 0.0001, + "loss": 7.1739, + "loss/crossentropy": 2.0421109199523926, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2004089578986168, + "step": 7380 + }, + { + "epoch": 0.461375, + "grad_norm": 2.109375, + "grad_norm_var": 0.006573232014973959, + "learning_rate": 0.0001, + "loss": 7.0469, + "loss/crossentropy": 2.216292977333069, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.1917620524764061, + "step": 7382 + }, + { + "epoch": 0.4615, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 7.1088, + "loss/crossentropy": 2.209986925125122, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20837366580963135, + "step": 7384 + }, + { + "epoch": 0.461625, + "grad_norm": 2.609375, + "grad_norm_var": 0.022809855143229165, + "learning_rate": 0.0001, + "loss": 7.3591, + "loss/crossentropy": 2.3474618196487427, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.22284895181655884, + "step": 7386 + }, + { + "epoch": 0.46175, + "grad_norm": 2.0, + "grad_norm_var": 0.0231597900390625, + "learning_rate": 0.0001, + "loss": 7.0282, + "loss/crossentropy": 2.0832881927490234, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.17219309508800507, + "step": 7388 + }, + { + "epoch": 0.461875, + "grad_norm": 2.21875, + "grad_norm_var": 0.023249308268229168, + "learning_rate": 0.0001, + "loss": 7.1468, + "loss/crossentropy": 2.2276368141174316, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.19344830513000488, + "step": 7390 + }, + { + "epoch": 0.462, + "grad_norm": 2.296875, + "grad_norm_var": 0.025169881184895833, + "learning_rate": 0.0001, + "loss": 7.0684, + "loss/crossentropy": 2.4470431804656982, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22533931583166122, + "step": 7392 + }, + { + "epoch": 0.462125, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02926813761393229, + "learning_rate": 0.0001, + "loss": 7.0658, + "loss/crossentropy": 2.2905768156051636, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.20665580034255981, + "step": 7394 + }, + { + "epoch": 0.46225, + "grad_norm": 2.359375, + "grad_norm_var": 0.0297760009765625, + "learning_rate": 0.0001, + "loss": 7.1369, + "loss/crossentropy": 2.167145550251007, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21655890345573425, + "step": 7396 + }, + { + "epoch": 0.462375, + "grad_norm": 1.984375, + "grad_norm_var": 0.03216552734375, + "learning_rate": 0.0001, + "loss": 7.0859, + "loss/crossentropy": 2.15854412317276, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20027539879083633, + "step": 7398 + }, + { + "epoch": 0.4625, + "grad_norm": 2.28125, + "grad_norm_var": 0.03135350545247396, + "learning_rate": 0.0001, + "loss": 7.1261, + "loss/crossentropy": 2.1508615016937256, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.18814773857593536, + "step": 7400 + }, + { + "epoch": 0.462625, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0194732666015625, + "learning_rate": 0.0001, + "loss": 7.134, + "loss/crossentropy": 1.842678189277649, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.18930108845233917, + "step": 7402 + }, + { + "epoch": 0.46275, + "grad_norm": 2.125, + "grad_norm_var": 0.018400065104166665, + "learning_rate": 0.0001, + "loss": 7.2671, + "loss/crossentropy": 2.4093555212020874, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21225199848413467, + "step": 7404 + }, + { + "epoch": 0.462875, + "grad_norm": 2.265625, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 7.2534, + "loss/crossentropy": 1.9698211550712585, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21386128664016724, + "step": 7406 + }, + { + "epoch": 0.463, + "grad_norm": 1.9375, + "grad_norm_var": 0.0200836181640625, + "learning_rate": 0.0001, + "loss": 7.2251, + "loss/crossentropy": 2.4693849086761475, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20892548561096191, + "step": 7408 + }, + { + "epoch": 0.463125, + "grad_norm": 2.0625, + "grad_norm_var": 0.01602350870768229, + "learning_rate": 0.0001, + "loss": 7.0863, + "loss/crossentropy": 2.1525356769561768, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20484411716461182, + "step": 7410 + }, + { + "epoch": 0.46325, + "grad_norm": 2.171875, + "grad_norm_var": 0.023339589436848957, + "learning_rate": 0.0001, + "loss": 7.0279, + "loss/crossentropy": 2.3783161640167236, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.19495395570993423, + "step": 7412 + }, + { + "epoch": 0.463375, + "grad_norm": 1.9921875, + "grad_norm_var": 0.022850545247395833, + "learning_rate": 0.0001, + "loss": 7.2075, + "loss/crossentropy": 2.3271056413650513, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19265718758106232, + "step": 7414 + }, + { + "epoch": 0.4635, + "grad_norm": 3.0625, + "grad_norm_var": 3.065482584635417, + "learning_rate": 0.0001, + "loss": 7.6398, + "loss/crossentropy": 2.133812189102173, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20769577473402023, + "step": 7416 + }, + { + "epoch": 0.463625, + "grad_norm": 2.09375, + "grad_norm_var": 3.042229970296224, + "learning_rate": 0.0001, + "loss": 7.213, + "loss/crossentropy": 2.110043227672577, + "loss/hidden": 2.9375, + "loss/jsd": 0.0, + "loss/logits": 0.21482352912425995, + "step": 7418 + }, + { + "epoch": 0.46375, + "grad_norm": 2.125, + "grad_norm_var": 3.038519032796224, + "learning_rate": 0.0001, + "loss": 7.1561, + "loss/crossentropy": 2.31582248210907, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19608522206544876, + "step": 7420 + }, + { + "epoch": 0.463875, + "grad_norm": 2.0625, + "grad_norm_var": 3.049450429280599, + "learning_rate": 0.0001, + "loss": 7.3156, + "loss/crossentropy": 2.19181752204895, + "loss/hidden": 2.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2320917323231697, + "step": 7422 + }, + { + "epoch": 0.464, + "grad_norm": 2.21875, + "grad_norm_var": 3.0374794006347656, + "learning_rate": 0.0001, + "loss": 7.1618, + "loss/crossentropy": 2.466021180152893, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20255694538354874, + "step": 7424 + }, + { + "epoch": 0.464125, + "grad_norm": 2.15625, + "grad_norm_var": 3.039378865559896, + "learning_rate": 0.0001, + "loss": 7.1428, + "loss/crossentropy": 2.425376534461975, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19863475114107132, + "step": 7426 + }, + { + "epoch": 0.46425, + "grad_norm": 2.265625, + "grad_norm_var": 3.04412841796875, + "learning_rate": 0.0001, + "loss": 7.0199, + "loss/crossentropy": 2.244443416595459, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19751153886318207, + "step": 7428 + }, + { + "epoch": 0.464375, + "grad_norm": 1.9609375, + "grad_norm_var": 3.063792928059896, + "learning_rate": 0.0001, + "loss": 7.0883, + "loss/crossentropy": 2.2773425579071045, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2200421243906021, + "step": 7430 + }, + { + "epoch": 0.4645, + "grad_norm": 2.3125, + "grad_norm_var": 0.01649169921875, + "learning_rate": 0.0001, + "loss": 7.1049, + "loss/crossentropy": 2.173538327217102, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21017886698246002, + "step": 7432 + }, + { + "epoch": 0.464625, + "grad_norm": 2.15625, + "grad_norm_var": 0.013334147135416667, + "learning_rate": 0.0001, + "loss": 7.2421, + "loss/crossentropy": 2.190195083618164, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2085673287510872, + "step": 7434 + }, + { + "epoch": 0.46475, + "grad_norm": 2.015625, + "grad_norm_var": 0.014900716145833333, + "learning_rate": 0.0001, + "loss": 7.1757, + "loss/crossentropy": 2.2361714839935303, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20774667710065842, + "step": 7436 + }, + { + "epoch": 0.464875, + "grad_norm": 2.03125, + "grad_norm_var": 0.014533487955729167, + "learning_rate": 0.0001, + "loss": 7.0734, + "loss/crossentropy": 2.3481364250183105, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19645172357559204, + "step": 7438 + }, + { + "epoch": 0.465, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019789377848307293, + "learning_rate": 0.0001, + "loss": 7.0093, + "loss/crossentropy": 1.8984848260879517, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.19228964298963547, + "step": 7440 + }, + { + "epoch": 0.465125, + "grad_norm": 2.234375, + "grad_norm_var": 0.019453938802083334, + "learning_rate": 0.0001, + "loss": 6.9896, + "loss/crossentropy": 2.2871270179748535, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2178485319018364, + "step": 7442 + }, + { + "epoch": 0.46525, + "grad_norm": 2.09375, + "grad_norm_var": 0.01668701171875, + "learning_rate": 0.0001, + "loss": 7.0334, + "loss/crossentropy": 2.2983537912368774, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.1965767741203308, + "step": 7444 + }, + { + "epoch": 0.465375, + "grad_norm": 2.03125, + "grad_norm_var": 0.014768218994140625, + "learning_rate": 0.0001, + "loss": 7.2321, + "loss/crossentropy": 2.178062319755554, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.1993517056107521, + "step": 7446 + }, + { + "epoch": 0.4655, + "grad_norm": 1.921875, + "grad_norm_var": 0.013632965087890626, + "learning_rate": 0.0001, + "loss": 7.0158, + "loss/crossentropy": 2.1353349685668945, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.1889559030532837, + "step": 7448 + }, + { + "epoch": 0.465625, + "grad_norm": 2.421875, + "grad_norm_var": 0.02373224894205729, + "learning_rate": 0.0001, + "loss": 7.2622, + "loss/crossentropy": 2.2076762914657593, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2045726329088211, + "step": 7450 + }, + { + "epoch": 0.46575, + "grad_norm": 2.15625, + "grad_norm_var": 0.022900136311848958, + "learning_rate": 0.0001, + "loss": 7.1263, + "loss/crossentropy": 2.4784727096557617, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.21691476553678513, + "step": 7452 + }, + { + "epoch": 0.465875, + "grad_norm": 2.125, + "grad_norm_var": 0.02217585245768229, + "learning_rate": 0.0001, + "loss": 7.0873, + "loss/crossentropy": 1.949112594127655, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21352668851613998, + "step": 7454 + }, + { + "epoch": 0.466, + "grad_norm": 2.046875, + "grad_norm_var": 0.017710113525390626, + "learning_rate": 0.0001, + "loss": 7.0893, + "loss/crossentropy": 1.9374956488609314, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19351773709058762, + "step": 7456 + }, + { + "epoch": 0.466125, + "grad_norm": 2.171875, + "grad_norm_var": 0.017227935791015624, + "learning_rate": 0.0001, + "loss": 7.107, + "loss/crossentropy": 2.6141308546066284, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22147035598754883, + "step": 7458 + }, + { + "epoch": 0.46625, + "grad_norm": 2.109375, + "grad_norm_var": 0.017411041259765624, + "learning_rate": 0.0001, + "loss": 7.2292, + "loss/crossentropy": 2.4253780841827393, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21378996968269348, + "step": 7460 + }, + { + "epoch": 0.466375, + "grad_norm": 2.125, + "grad_norm_var": 0.017050933837890626, + "learning_rate": 0.0001, + "loss": 7.1957, + "loss/crossentropy": 2.009346067905426, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2008848488330841, + "step": 7462 + }, + { + "epoch": 0.4665, + "grad_norm": 2.359375, + "grad_norm_var": 0.01788508097330729, + "learning_rate": 0.0001, + "loss": 7.2114, + "loss/crossentropy": 2.048809766769409, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.19820420444011688, + "step": 7464 + }, + { + "epoch": 0.466625, + "grad_norm": 2.03125, + "grad_norm_var": 0.009893544514973958, + "learning_rate": 0.0001, + "loss": 7.2132, + "loss/crossentropy": 2.3207257986068726, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.20752790570259094, + "step": 7466 + }, + { + "epoch": 0.46675, + "grad_norm": 2.15625, + "grad_norm_var": 0.010607655843098958, + "learning_rate": 0.0001, + "loss": 7.0494, + "loss/crossentropy": 2.112247109413147, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.19827520847320557, + "step": 7468 + }, + { + "epoch": 0.466875, + "grad_norm": 2.125, + "grad_norm_var": 0.015447743733723958, + "learning_rate": 0.0001, + "loss": 7.1738, + "loss/crossentropy": 2.2079219818115234, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22125893086194992, + "step": 7470 + }, + { + "epoch": 0.467, + "grad_norm": 2.09375, + "grad_norm_var": 0.013386027018229166, + "learning_rate": 0.0001, + "loss": 7.0565, + "loss/crossentropy": 2.2796353101730347, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.20792751014232635, + "step": 7472 + }, + { + "epoch": 0.467125, + "grad_norm": 2.09375, + "grad_norm_var": 0.0126129150390625, + "learning_rate": 0.0001, + "loss": 7.1541, + "loss/crossentropy": 2.4691073894500732, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2315238192677498, + "step": 7474 + }, + { + "epoch": 0.46725, + "grad_norm": 2.09375, + "grad_norm_var": 0.012727864583333333, + "learning_rate": 0.0001, + "loss": 6.9937, + "loss/crossentropy": 2.2356735467910767, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19917455315589905, + "step": 7476 + }, + { + "epoch": 0.467375, + "grad_norm": 2.140625, + "grad_norm_var": 0.0129058837890625, + "learning_rate": 0.0001, + "loss": 7.1406, + "loss/crossentropy": 2.2866714000701904, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21963119506835938, + "step": 7478 + }, + { + "epoch": 0.4675, + "grad_norm": 2.140625, + "grad_norm_var": 0.0102935791015625, + "learning_rate": 0.0001, + "loss": 7.0201, + "loss/crossentropy": 2.419238805770874, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21799220889806747, + "step": 7480 + }, + { + "epoch": 0.467625, + "grad_norm": 2.171875, + "grad_norm_var": 0.0093658447265625, + "learning_rate": 0.0001, + "loss": 7.0075, + "loss/crossentropy": 2.33626389503479, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.20474457740783691, + "step": 7482 + }, + { + "epoch": 0.46775, + "grad_norm": 2.0, + "grad_norm_var": 0.009691365559895833, + "learning_rate": 0.0001, + "loss": 7.1476, + "loss/crossentropy": 2.276577591896057, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2041800171136856, + "step": 7484 + }, + { + "epoch": 0.467875, + "grad_norm": 2.375, + "grad_norm_var": 0.008202107747395833, + "learning_rate": 0.0001, + "loss": 7.1914, + "loss/crossentropy": 2.279148817062378, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.21196125447750092, + "step": 7486 + }, + { + "epoch": 0.468, + "grad_norm": 1.921875, + "grad_norm_var": 0.01376953125, + "learning_rate": 0.0001, + "loss": 7.0034, + "loss/crossentropy": 2.241237759590149, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.19696836173534393, + "step": 7488 + }, + { + "epoch": 0.468125, + "grad_norm": 2.140625, + "grad_norm_var": 0.01412353515625, + "learning_rate": 0.0001, + "loss": 7.1467, + "loss/crossentropy": 2.1839855909347534, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19702082127332687, + "step": 7490 + }, + { + "epoch": 0.46825, + "grad_norm": 2.078125, + "grad_norm_var": 0.013988240559895834, + "learning_rate": 0.0001, + "loss": 7.133, + "loss/crossentropy": 2.1689497232437134, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2228970304131508, + "step": 7492 + }, + { + "epoch": 0.468375, + "grad_norm": 2.25, + "grad_norm_var": 0.013060506184895833, + "learning_rate": 0.0001, + "loss": 7.1162, + "loss/crossentropy": 2.161349654197693, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21048546582460403, + "step": 7494 + }, + { + "epoch": 0.4685, + "grad_norm": 2.1875, + "grad_norm_var": 0.016910807291666666, + "learning_rate": 0.0001, + "loss": 7.1184, + "loss/crossentropy": 2.362098455429077, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20488198101520538, + "step": 7496 + }, + { + "epoch": 0.468625, + "grad_norm": 2.203125, + "grad_norm_var": 0.0183502197265625, + "learning_rate": 0.0001, + "loss": 6.9562, + "loss/crossentropy": 1.9627411365509033, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19364163279533386, + "step": 7498 + }, + { + "epoch": 0.46875, + "grad_norm": 2.09375, + "grad_norm_var": 0.01845703125, + "learning_rate": 0.0001, + "loss": 7.062, + "loss/crossentropy": 2.4354528188705444, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21706019341945648, + "step": 7500 + }, + { + "epoch": 0.468875, + "grad_norm": 2.828125, + "grad_norm_var": 0.04521484375, + "learning_rate": 0.0001, + "loss": 7.2507, + "loss/crossentropy": 2.263838768005371, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20706459134817123, + "step": 7502 + }, + { + "epoch": 0.469, + "grad_norm": 2.375, + "grad_norm_var": 0.0410552978515625, + "learning_rate": 0.0001, + "loss": 6.9699, + "loss/crossentropy": 2.2720807790756226, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21732699871063232, + "step": 7504 + }, + { + "epoch": 0.469125, + "grad_norm": 2.3125, + "grad_norm_var": 0.0393707275390625, + "learning_rate": 0.0001, + "loss": 7.3032, + "loss/crossentropy": 2.527849555015564, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2073214203119278, + "step": 7506 + }, + { + "epoch": 0.46925, + "grad_norm": 2.34375, + "grad_norm_var": 0.0349761962890625, + "learning_rate": 0.0001, + "loss": 7.1871, + "loss/crossentropy": 2.335466742515564, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21534393727779388, + "step": 7508 + }, + { + "epoch": 0.469375, + "grad_norm": 2.0625, + "grad_norm_var": 0.04195556640625, + "learning_rate": 0.0001, + "loss": 7.0814, + "loss/crossentropy": 2.4958906173706055, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20870383828878403, + "step": 7510 + }, + { + "epoch": 0.4695, + "grad_norm": 1.9921875, + "grad_norm_var": 0.046567535400390624, + "learning_rate": 0.0001, + "loss": 7.1191, + "loss/crossentropy": 2.233251214027405, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19957691431045532, + "step": 7512 + }, + { + "epoch": 0.469625, + "grad_norm": 2.28125, + "grad_norm_var": 0.04698867797851562, + "learning_rate": 0.0001, + "loss": 7.2159, + "loss/crossentropy": 2.1861671209335327, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2184196561574936, + "step": 7514 + }, + { + "epoch": 0.46975, + "grad_norm": 2.21875, + "grad_norm_var": 0.041473134358723955, + "learning_rate": 0.0001, + "loss": 7.288, + "loss/crossentropy": 2.273300290107727, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2080041542649269, + "step": 7516 + }, + { + "epoch": 0.469875, + "grad_norm": 1.921875, + "grad_norm_var": 0.02493260701497396, + "learning_rate": 0.0001, + "loss": 7.1214, + "loss/crossentropy": 2.286626935005188, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.23020273447036743, + "step": 7518 + }, + { + "epoch": 0.47, + "grad_norm": 2.25, + "grad_norm_var": 0.01982599894205729, + "learning_rate": 0.0001, + "loss": 7.1163, + "loss/crossentropy": 2.25377357006073, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19812580943107605, + "step": 7520 + }, + { + "epoch": 0.470125, + "grad_norm": 2.25, + "grad_norm_var": 0.01633478800455729, + "learning_rate": 0.0001, + "loss": 6.9896, + "loss/crossentropy": 2.268404960632324, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22042839229106903, + "step": 7522 + }, + { + "epoch": 0.47025, + "grad_norm": 1.9375, + "grad_norm_var": 0.016017405192057292, + "learning_rate": 0.0001, + "loss": 7.0678, + "loss/crossentropy": 2.1755711436271667, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21114220470190048, + "step": 7524 + }, + { + "epoch": 0.470375, + "grad_norm": 2.3125, + "grad_norm_var": 0.017093658447265625, + "learning_rate": 0.0001, + "loss": 7.1598, + "loss/crossentropy": 2.4401893615722656, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.22914503514766693, + "step": 7526 + }, + { + "epoch": 0.4705, + "grad_norm": 2.046875, + "grad_norm_var": 0.01617431640625, + "learning_rate": 0.0001, + "loss": 7.2893, + "loss/crossentropy": 2.613025188446045, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21846508979797363, + "step": 7528 + }, + { + "epoch": 0.470625, + "grad_norm": 2.359375, + "grad_norm_var": 0.019331868489583334, + "learning_rate": 0.0001, + "loss": 7.2158, + "loss/crossentropy": 2.3140453100204468, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.22654858231544495, + "step": 7530 + }, + { + "epoch": 0.47075, + "grad_norm": 2.09375, + "grad_norm_var": 0.0232330322265625, + "learning_rate": 0.0001, + "loss": 7.0316, + "loss/crossentropy": 2.0402657985687256, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.18750952184200287, + "step": 7532 + }, + { + "epoch": 0.470875, + "grad_norm": 2.046875, + "grad_norm_var": 0.01988525390625, + "learning_rate": 0.0001, + "loss": 7.1331, + "loss/crossentropy": 2.567373514175415, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.22395136952400208, + "step": 7534 + }, + { + "epoch": 0.471, + "grad_norm": 2.140625, + "grad_norm_var": 0.023465983072916665, + "learning_rate": 0.0001, + "loss": 7.2494, + "loss/crossentropy": 2.3188188076019287, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21441180258989334, + "step": 7536 + }, + { + "epoch": 0.471125, + "grad_norm": 2.296875, + "grad_norm_var": 0.02314453125, + "learning_rate": 0.0001, + "loss": 7.0734, + "loss/crossentropy": 2.2127068042755127, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.1956922933459282, + "step": 7538 + }, + { + "epoch": 0.47125, + "grad_norm": 2.015625, + "grad_norm_var": 0.021028645833333335, + "learning_rate": 0.0001, + "loss": 6.9942, + "loss/crossentropy": 2.3275548219680786, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21980319917201996, + "step": 7540 + }, + { + "epoch": 0.471375, + "grad_norm": 2.046875, + "grad_norm_var": 0.02574462890625, + "learning_rate": 0.0001, + "loss": 7.2398, + "loss/crossentropy": 2.232550859451294, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.18873104453086853, + "step": 7542 + }, + { + "epoch": 0.4715, + "grad_norm": 2.140625, + "grad_norm_var": 0.026496378580729167, + "learning_rate": 0.0001, + "loss": 7.1301, + "loss/crossentropy": 2.1725897789001465, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20075791329145432, + "step": 7544 + }, + { + "epoch": 0.471625, + "grad_norm": 2.046875, + "grad_norm_var": 0.023665364583333334, + "learning_rate": 0.0001, + "loss": 7.0794, + "loss/crossentropy": 2.0849578976631165, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20648343861103058, + "step": 7546 + }, + { + "epoch": 0.47175, + "grad_norm": 2.140625, + "grad_norm_var": 0.020296223958333335, + "learning_rate": 0.0001, + "loss": 7.0611, + "loss/crossentropy": 2.085715174674988, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2036331295967102, + "step": 7548 + }, + { + "epoch": 0.471875, + "grad_norm": 2.15625, + "grad_norm_var": 0.019319661458333335, + "learning_rate": 0.0001, + "loss": 7.1509, + "loss/crossentropy": 1.9772019982337952, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.19998165220022202, + "step": 7550 + }, + { + "epoch": 0.472, + "grad_norm": 2.0625, + "grad_norm_var": 0.016141764322916665, + "learning_rate": 0.0001, + "loss": 7.0461, + "loss/crossentropy": 2.2595661878585815, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20990056544542313, + "step": 7552 + }, + { + "epoch": 0.472125, + "grad_norm": 2.046875, + "grad_norm_var": 0.016673787434895834, + "learning_rate": 0.0001, + "loss": 7.0228, + "loss/crossentropy": 2.2455222606658936, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2194531187415123, + "step": 7554 + }, + { + "epoch": 0.47225, + "grad_norm": 2.140625, + "grad_norm_var": 0.014567057291666666, + "learning_rate": 0.0001, + "loss": 7.1894, + "loss/crossentropy": 2.240777611732483, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20595575124025345, + "step": 7556 + }, + { + "epoch": 0.472375, + "grad_norm": 2.109375, + "grad_norm_var": 0.008161417643229167, + "learning_rate": 0.0001, + "loss": 7.06, + "loss/crossentropy": 2.1476142406463623, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.18031620979309082, + "step": 7558 + }, + { + "epoch": 0.4725, + "grad_norm": 2.109375, + "grad_norm_var": 0.008275349934895834, + "learning_rate": 0.0001, + "loss": 7.0674, + "loss/crossentropy": 2.2802597284317017, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20469427108764648, + "step": 7560 + }, + { + "epoch": 0.472625, + "grad_norm": 2.109375, + "grad_norm_var": 0.007645670572916667, + "learning_rate": 0.0001, + "loss": 7.0796, + "loss/crossentropy": 2.3202688694000244, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2028881385922432, + "step": 7562 + }, + { + "epoch": 0.47275, + "grad_norm": 2.109375, + "grad_norm_var": 0.00732421875, + "learning_rate": 0.0001, + "loss": 7.0198, + "loss/crossentropy": 2.185010552406311, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.22524073719978333, + "step": 7564 + }, + { + "epoch": 0.472875, + "grad_norm": 2.421875, + "grad_norm_var": 0.01376953125, + "learning_rate": 0.0001, + "loss": 7.1435, + "loss/crossentropy": 2.1675198078155518, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19807401299476624, + "step": 7566 + }, + { + "epoch": 0.473, + "grad_norm": 1.890625, + "grad_norm_var": 0.016803995768229166, + "learning_rate": 0.0001, + "loss": 7.0201, + "loss/crossentropy": 2.437238574028015, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2327703759074211, + "step": 7568 + }, + { + "epoch": 0.473125, + "grad_norm": 2.09375, + "grad_norm_var": 0.0144927978515625, + "learning_rate": 0.0001, + "loss": 6.9488, + "loss/crossentropy": 2.4254690408706665, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.21333793550729752, + "step": 7570 + }, + { + "epoch": 0.47325, + "grad_norm": 2.046875, + "grad_norm_var": 0.016276041666666668, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.5538944005966187, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.21501559019088745, + "step": 7572 + }, + { + "epoch": 0.473375, + "grad_norm": 2.078125, + "grad_norm_var": 0.01484375, + "learning_rate": 0.0001, + "loss": 7.1061, + "loss/crossentropy": 2.2689521312713623, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.19831790030002594, + "step": 7574 + }, + { + "epoch": 0.4735, + "grad_norm": 2.28125, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 7.0325, + "loss/crossentropy": 1.871632695198059, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19257056713104248, + "step": 7576 + }, + { + "epoch": 0.473625, + "grad_norm": 1.953125, + "grad_norm_var": 0.017496744791666668, + "learning_rate": 0.0001, + "loss": 7.165, + "loss/crossentropy": 2.2190489768981934, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20526662468910217, + "step": 7578 + }, + { + "epoch": 0.47375, + "grad_norm": 2.109375, + "grad_norm_var": 0.017316691080729165, + "learning_rate": 0.0001, + "loss": 7.3, + "loss/crossentropy": 2.4316731691360474, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21631304919719696, + "step": 7580 + }, + { + "epoch": 0.473875, + "grad_norm": 2.0625, + "grad_norm_var": 0.009273274739583334, + "learning_rate": 0.0001, + "loss": 7.1383, + "loss/crossentropy": 2.157890558242798, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.18887675553560257, + "step": 7582 + }, + { + "epoch": 0.474, + "grad_norm": 2.015625, + "grad_norm_var": 0.007763671875, + "learning_rate": 0.0001, + "loss": 7.1594, + "loss/crossentropy": 2.0798093676567078, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2017417550086975, + "step": 7584 + }, + { + "epoch": 0.474125, + "grad_norm": 2.4375, + "grad_norm_var": 0.01455078125, + "learning_rate": 0.0001, + "loss": 7.2126, + "loss/crossentropy": 2.38494336605072, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.22561874240636826, + "step": 7586 + }, + { + "epoch": 0.47425, + "grad_norm": 2.125, + "grad_norm_var": 0.018294270833333334, + "learning_rate": 0.0001, + "loss": 7.2926, + "loss/crossentropy": 2.4800193309783936, + "loss/hidden": 2.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2282564714550972, + "step": 7588 + }, + { + "epoch": 0.474375, + "grad_norm": 2.28125, + "grad_norm_var": 0.02139460245768229, + "learning_rate": 0.0001, + "loss": 7.0307, + "loss/crossentropy": 2.2453829050064087, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21037797629833221, + "step": 7590 + }, + { + "epoch": 0.4745, + "grad_norm": 2.28125, + "grad_norm_var": 0.021469879150390624, + "learning_rate": 0.0001, + "loss": 7.208, + "loss/crossentropy": 2.2749218940734863, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2083357498049736, + "step": 7592 + }, + { + "epoch": 0.474625, + "grad_norm": 2.015625, + "grad_norm_var": 0.019608306884765624, + "learning_rate": 0.0001, + "loss": 7.0957, + "loss/crossentropy": 2.182799458503723, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.18513298779726028, + "step": 7594 + }, + { + "epoch": 0.47475, + "grad_norm": 2.0625, + "grad_norm_var": 0.020304107666015626, + "learning_rate": 0.0001, + "loss": 7.0361, + "loss/crossentropy": 2.3911205530166626, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20220442116260529, + "step": 7596 + }, + { + "epoch": 0.474875, + "grad_norm": 2.078125, + "grad_norm_var": 0.019712066650390624, + "learning_rate": 0.0001, + "loss": 7.2206, + "loss/crossentropy": 2.3215757608413696, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2056894525885582, + "step": 7598 + }, + { + "epoch": 0.475, + "grad_norm": 2.015625, + "grad_norm_var": 0.019589996337890624, + "learning_rate": 0.0001, + "loss": 7.1877, + "loss/crossentropy": 2.3881407976150513, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2328326404094696, + "step": 7600 + }, + { + "epoch": 0.475125, + "grad_norm": 2.0625, + "grad_norm_var": 0.01568578084309896, + "learning_rate": 0.0001, + "loss": 6.955, + "loss/crossentropy": 2.414221405982971, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20993871241807938, + "step": 7602 + }, + { + "epoch": 0.47525, + "grad_norm": 2.296875, + "grad_norm_var": 0.012276204427083333, + "learning_rate": 0.0001, + "loss": 7.0465, + "loss/crossentropy": 2.051502525806427, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.18645334243774414, + "step": 7604 + }, + { + "epoch": 0.475375, + "grad_norm": 2.109375, + "grad_norm_var": 0.010241444905598958, + "learning_rate": 0.0001, + "loss": 7.1153, + "loss/crossentropy": 2.0045130848884583, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.206765778362751, + "step": 7606 + }, + { + "epoch": 0.4755, + "grad_norm": 2.046875, + "grad_norm_var": 0.008538564046223959, + "learning_rate": 0.0001, + "loss": 7.0222, + "loss/crossentropy": 2.3240236043930054, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.19366633147001266, + "step": 7608 + }, + { + "epoch": 0.475625, + "grad_norm": 2.171875, + "grad_norm_var": 0.007956695556640626, + "learning_rate": 0.0001, + "loss": 7.0812, + "loss/crossentropy": 1.9922118186950684, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.20807665586471558, + "step": 7610 + }, + { + "epoch": 0.47575, + "grad_norm": 2.203125, + "grad_norm_var": 0.010909016927083333, + "learning_rate": 0.0001, + "loss": 7.0856, + "loss/crossentropy": 1.825062870979309, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.1701940894126892, + "step": 7612 + }, + { + "epoch": 0.475875, + "grad_norm": 2.09375, + "grad_norm_var": 0.0108642578125, + "learning_rate": 0.0001, + "loss": 7.2186, + "loss/crossentropy": 2.29690420627594, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20619598031044006, + "step": 7614 + }, + { + "epoch": 0.476, + "grad_norm": 2.109375, + "grad_norm_var": 0.010941569010416667, + "learning_rate": 0.0001, + "loss": 7.2496, + "loss/crossentropy": 2.193692684173584, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20736318081617355, + "step": 7616 + }, + { + "epoch": 0.476125, + "grad_norm": 2.078125, + "grad_norm_var": 0.010741170247395833, + "learning_rate": 0.0001, + "loss": 7.0215, + "loss/crossentropy": 2.1778957843780518, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20710615813732147, + "step": 7618 + }, + { + "epoch": 0.47625, + "grad_norm": 1.9375, + "grad_norm_var": 0.010237375895182291, + "learning_rate": 0.0001, + "loss": 7.0398, + "loss/crossentropy": 2.1610811948776245, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19665740430355072, + "step": 7620 + }, + { + "epoch": 0.476375, + "grad_norm": 2.09375, + "grad_norm_var": 0.008137766520182292, + "learning_rate": 0.0001, + "loss": 7.1509, + "loss/crossentropy": 2.0157353281974792, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.19302255660295486, + "step": 7622 + }, + { + "epoch": 0.4765, + "grad_norm": 2.0625, + "grad_norm_var": 0.007816314697265625, + "learning_rate": 0.0001, + "loss": 7.122, + "loss/crossentropy": 2.2813684940338135, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.20449642091989517, + "step": 7624 + }, + { + "epoch": 0.476625, + "grad_norm": 2.078125, + "grad_norm_var": 0.007500966389973958, + "learning_rate": 0.0001, + "loss": 6.9673, + "loss/crossentropy": 2.1194839477539062, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19238164275884628, + "step": 7626 + }, + { + "epoch": 0.47675, + "grad_norm": 2.640625, + "grad_norm_var": 0.02554931640625, + "learning_rate": 0.0001, + "loss": 7.1515, + "loss/crossentropy": 2.177587628364563, + "loss/hidden": 2.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.207644984126091, + "step": 7628 + }, + { + "epoch": 0.476875, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02942479451497396, + "learning_rate": 0.0001, + "loss": 7.0191, + "loss/crossentropy": 2.3335689306259155, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21252815425395966, + "step": 7630 + }, + { + "epoch": 0.477, + "grad_norm": 2.09375, + "grad_norm_var": 0.02950007120768229, + "learning_rate": 0.0001, + "loss": 7.1822, + "loss/crossentropy": 2.325187921524048, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21348048746585846, + "step": 7632 + }, + { + "epoch": 0.477125, + "grad_norm": 2.0625, + "grad_norm_var": 0.029325103759765624, + "learning_rate": 0.0001, + "loss": 7.1423, + "loss/crossentropy": 2.287824869155884, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20349889993667603, + "step": 7634 + }, + { + "epoch": 0.47725, + "grad_norm": 2.0, + "grad_norm_var": 0.028574371337890626, + "learning_rate": 0.0001, + "loss": 7.0346, + "loss/crossentropy": 2.2881882190704346, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.1959080845117569, + "step": 7636 + }, + { + "epoch": 0.477375, + "grad_norm": 2.171875, + "grad_norm_var": 0.029842122395833334, + "learning_rate": 0.0001, + "loss": 7.1162, + "loss/crossentropy": 2.3250468969345093, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.1951511949300766, + "step": 7638 + }, + { + "epoch": 0.4775, + "grad_norm": 2.234375, + "grad_norm_var": 0.030036417643229167, + "learning_rate": 0.0001, + "loss": 7.3131, + "loss/crossentropy": 2.1652873754501343, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20676423609256744, + "step": 7640 + }, + { + "epoch": 0.477625, + "grad_norm": 2.109375, + "grad_norm_var": 0.0290924072265625, + "learning_rate": 0.0001, + "loss": 7.1448, + "loss/crossentropy": 2.1286062598228455, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20646179467439651, + "step": 7642 + }, + { + "epoch": 0.47775, + "grad_norm": 2.109375, + "grad_norm_var": 0.0110107421875, + "learning_rate": 0.0001, + "loss": 7.0397, + "loss/crossentropy": 2.2193092107772827, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20696381479501724, + "step": 7644 + }, + { + "epoch": 0.477875, + "grad_norm": 2.125, + "grad_norm_var": 0.006876373291015625, + "learning_rate": 0.0001, + "loss": 7.2361, + "loss/crossentropy": 2.1308083534240723, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.1990327164530754, + "step": 7646 + }, + { + "epoch": 0.478, + "grad_norm": 2.03125, + "grad_norm_var": 0.006748199462890625, + "learning_rate": 0.0001, + "loss": 7.2022, + "loss/crossentropy": 2.4250513315200806, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.22592860460281372, + "step": 7648 + }, + { + "epoch": 0.478125, + "grad_norm": 2.140625, + "grad_norm_var": 0.006650543212890625, + "learning_rate": 0.0001, + "loss": 7.1834, + "loss/crossentropy": 2.297890782356262, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2043851688504219, + "step": 7650 + }, + { + "epoch": 0.47825, + "grad_norm": 2.375, + "grad_norm_var": 0.011191558837890626, + "learning_rate": 0.0001, + "loss": 6.9906, + "loss/crossentropy": 2.03427255153656, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.18367985635995865, + "step": 7652 + }, + { + "epoch": 0.478375, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015474192301432292, + "learning_rate": 0.0001, + "loss": 7.0992, + "loss/crossentropy": 2.3343425989151, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.210361510515213, + "step": 7654 + }, + { + "epoch": 0.4785, + "grad_norm": 2.109375, + "grad_norm_var": 0.015152740478515624, + "learning_rate": 0.0001, + "loss": 6.9502, + "loss/crossentropy": 2.5432735681533813, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2085108458995819, + "step": 7656 + }, + { + "epoch": 0.478625, + "grad_norm": 2.046875, + "grad_norm_var": 0.014745839436848958, + "learning_rate": 0.0001, + "loss": 7.1265, + "loss/crossentropy": 2.376760482788086, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21046151965856552, + "step": 7658 + }, + { + "epoch": 0.47875, + "grad_norm": 2.109375, + "grad_norm_var": 0.014564768473307291, + "learning_rate": 0.0001, + "loss": 7.091, + "loss/crossentropy": 2.1871683597564697, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20009056478738785, + "step": 7660 + }, + { + "epoch": 0.478875, + "grad_norm": 2.15625, + "grad_norm_var": 0.015600331624348958, + "learning_rate": 0.0001, + "loss": 7.0751, + "loss/crossentropy": 2.1381759643554688, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20151401311159134, + "step": 7662 + }, + { + "epoch": 0.479, + "grad_norm": 2.078125, + "grad_norm_var": 0.015228017171223959, + "learning_rate": 0.0001, + "loss": 7.1749, + "loss/crossentropy": 2.204997420310974, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20210929214954376, + "step": 7664 + }, + { + "epoch": 0.479125, + "grad_norm": 2.234375, + "grad_norm_var": 0.021683502197265624, + "learning_rate": 0.0001, + "loss": 7.1606, + "loss/crossentropy": 2.5145864486694336, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.24845977127552032, + "step": 7666 + }, + { + "epoch": 0.47925, + "grad_norm": 2.125, + "grad_norm_var": 0.014534250895182291, + "learning_rate": 0.0001, + "loss": 7.0797, + "loss/crossentropy": 2.1923307180404663, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19743437319993973, + "step": 7668 + }, + { + "epoch": 0.479375, + "grad_norm": 2.0625, + "grad_norm_var": 0.010326131184895834, + "learning_rate": 0.0001, + "loss": 7.1655, + "loss/crossentropy": 2.2290977239608765, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.18826957046985626, + "step": 7670 + }, + { + "epoch": 0.4795, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0140777587890625, + "learning_rate": 0.0001, + "loss": 7.0097, + "loss/crossentropy": 2.1928118467330933, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.20673301815986633, + "step": 7672 + }, + { + "epoch": 0.479625, + "grad_norm": 2.1875, + "grad_norm_var": 0.013825480143229167, + "learning_rate": 0.0001, + "loss": 7.1607, + "loss/crossentropy": 2.399188995361328, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21841491758823395, + "step": 7674 + }, + { + "epoch": 0.47975, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01598078409830729, + "learning_rate": 0.0001, + "loss": 7.1132, + "loss/crossentropy": 2.001654326915741, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.17933339625597, + "step": 7676 + }, + { + "epoch": 0.479875, + "grad_norm": 2.015625, + "grad_norm_var": 0.01590550740559896, + "learning_rate": 0.0001, + "loss": 7.0738, + "loss/crossentropy": 2.2801817655563354, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19666346907615662, + "step": 7678 + }, + { + "epoch": 0.48, + "grad_norm": 2.109375, + "grad_norm_var": 0.015582021077473958, + "learning_rate": 0.0001, + "loss": 7.1026, + "loss/crossentropy": 2.081633687019348, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19271892309188843, + "step": 7680 + }, + { + "epoch": 0.480125, + "grad_norm": 2.203125, + "grad_norm_var": 0.008813222249348959, + "learning_rate": 0.0001, + "loss": 7.211, + "loss/crossentropy": 2.392168879508972, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.18299797177314758, + "step": 7682 + }, + { + "epoch": 0.48025, + "grad_norm": 2.140625, + "grad_norm_var": 0.010147857666015624, + "learning_rate": 0.0001, + "loss": 7.3303, + "loss/crossentropy": 2.3119935989379883, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.23662632703781128, + "step": 7684 + }, + { + "epoch": 0.480375, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 7.0347, + "loss/crossentropy": 2.2318339347839355, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.21657077223062515, + "step": 7686 + }, + { + "epoch": 0.4805, + "grad_norm": 2.0625, + "grad_norm_var": 0.007819620768229167, + "learning_rate": 0.0001, + "loss": 6.9663, + "loss/crossentropy": 2.1833382844924927, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.21761953830718994, + "step": 7688 + }, + { + "epoch": 0.480625, + "grad_norm": 2.109375, + "grad_norm_var": 0.006322224934895833, + "learning_rate": 0.0001, + "loss": 7.0488, + "loss/crossentropy": 2.134721040725708, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.1840914860367775, + "step": 7690 + }, + { + "epoch": 0.48075, + "grad_norm": 2.140625, + "grad_norm_var": 0.006231435139973958, + "learning_rate": 0.0001, + "loss": 7.0641, + "loss/crossentropy": 2.2247310876846313, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22055795043706894, + "step": 7692 + }, + { + "epoch": 0.480875, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007892862955729166, + "learning_rate": 0.0001, + "loss": 7.0667, + "loss/crossentropy": 2.27902615070343, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21320205926895142, + "step": 7694 + }, + { + "epoch": 0.481, + "grad_norm": 2.0625, + "grad_norm_var": 0.007950846354166667, + "learning_rate": 0.0001, + "loss": 7.0584, + "loss/crossentropy": 2.0421899557113647, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.1877262145280838, + "step": 7696 + }, + { + "epoch": 0.481125, + "grad_norm": 2.15625, + "grad_norm_var": 0.0074615478515625, + "learning_rate": 0.0001, + "loss": 7.0576, + "loss/crossentropy": 2.2370803356170654, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20086784660816193, + "step": 7698 + }, + { + "epoch": 0.48125, + "grad_norm": 2.1875, + "grad_norm_var": 0.0062652587890625, + "learning_rate": 0.0001, + "loss": 7.141, + "loss/crossentropy": 2.2639442682266235, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20139917731285095, + "step": 7700 + }, + { + "epoch": 0.481375, + "grad_norm": 2.03125, + "grad_norm_var": 0.005944569905598958, + "learning_rate": 0.0001, + "loss": 7.0808, + "loss/crossentropy": 2.308246374130249, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2012246772646904, + "step": 7702 + }, + { + "epoch": 0.4815, + "grad_norm": 2.25, + "grad_norm_var": 0.007452138264973958, + "learning_rate": 0.0001, + "loss": 7.1243, + "loss/crossentropy": 2.406078815460205, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20916583389043808, + "step": 7704 + }, + { + "epoch": 0.481625, + "grad_norm": 2.40625, + "grad_norm_var": 0.013730621337890625, + "learning_rate": 0.0001, + "loss": 7.0034, + "loss/crossentropy": 2.3612505197525024, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2026866227388382, + "step": 7706 + }, + { + "epoch": 0.48175, + "grad_norm": 2.015625, + "grad_norm_var": 0.014928944905598958, + "learning_rate": 0.0001, + "loss": 7.0391, + "loss/crossentropy": 2.1469807028770447, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.1992548406124115, + "step": 7708 + }, + { + "epoch": 0.481875, + "grad_norm": 2.078125, + "grad_norm_var": 0.012711588541666667, + "learning_rate": 0.0001, + "loss": 7.1218, + "loss/crossentropy": 2.0821534991264343, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2098289653658867, + "step": 7710 + }, + { + "epoch": 0.482, + "grad_norm": 2.28125, + "grad_norm_var": 0.015070597330729166, + "learning_rate": 0.0001, + "loss": 7.1686, + "loss/crossentropy": 2.180022358894348, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.21401400864124298, + "step": 7712 + }, + { + "epoch": 0.482125, + "grad_norm": 2.078125, + "grad_norm_var": 0.014644368489583334, + "learning_rate": 0.0001, + "loss": 7.1485, + "loss/crossentropy": 2.3946781158447266, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22900524735450745, + "step": 7714 + }, + { + "epoch": 0.48225, + "grad_norm": 2.140625, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 7.1556, + "loss/crossentropy": 2.135510504245758, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21871786564588547, + "step": 7716 + }, + { + "epoch": 0.482375, + "grad_norm": 2.140625, + "grad_norm_var": 0.014777628580729167, + "learning_rate": 0.0001, + "loss": 7.2415, + "loss/crossentropy": 2.286018133163452, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21493077278137207, + "step": 7718 + }, + { + "epoch": 0.4825, + "grad_norm": 2.171875, + "grad_norm_var": 0.017748006184895835, + "learning_rate": 0.0001, + "loss": 7.0218, + "loss/crossentropy": 2.0190805792808533, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20633917301893234, + "step": 7720 + }, + { + "epoch": 0.482625, + "grad_norm": 2.03125, + "grad_norm_var": 0.0145660400390625, + "learning_rate": 0.0001, + "loss": 7.1712, + "loss/crossentropy": 2.0265942811965942, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20769783109426498, + "step": 7722 + }, + { + "epoch": 0.48275, + "grad_norm": 2.0625, + "grad_norm_var": 0.011454264322916666, + "learning_rate": 0.0001, + "loss": 6.9481, + "loss/crossentropy": 2.119886040687561, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19638077169656754, + "step": 7724 + }, + { + "epoch": 0.482875, + "grad_norm": 2.125, + "grad_norm_var": 0.009696451822916667, + "learning_rate": 0.0001, + "loss": 7.1273, + "loss/crossentropy": 2.1580886840820312, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.1923273205757141, + "step": 7726 + }, + { + "epoch": 0.483, + "grad_norm": 2.0625, + "grad_norm_var": 0.009137980143229167, + "learning_rate": 0.0001, + "loss": 7.0008, + "loss/crossentropy": 2.1663527488708496, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20339468121528625, + "step": 7728 + }, + { + "epoch": 0.483125, + "grad_norm": 2.125, + "grad_norm_var": 0.008576456705729167, + "learning_rate": 0.0001, + "loss": 7.2413, + "loss/crossentropy": 2.182821273803711, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21904577314853668, + "step": 7730 + }, + { + "epoch": 0.48325, + "grad_norm": 2.078125, + "grad_norm_var": 0.008919270833333333, + "learning_rate": 0.0001, + "loss": 7.1494, + "loss/crossentropy": 2.373465061187744, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2193896695971489, + "step": 7732 + }, + { + "epoch": 0.483375, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0124176025390625, + "learning_rate": 0.0001, + "loss": 6.9891, + "loss/crossentropy": 2.6929373741149902, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21133850514888763, + "step": 7734 + }, + { + "epoch": 0.4835, + "grad_norm": 2.09375, + "grad_norm_var": 0.0075185139973958336, + "learning_rate": 0.0001, + "loss": 7.241, + "loss/crossentropy": 2.153095006942749, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2049623802304268, + "step": 7736 + }, + { + "epoch": 0.483625, + "grad_norm": 2.328125, + "grad_norm_var": 0.010358683268229167, + "learning_rate": 0.0001, + "loss": 7.2583, + "loss/crossentropy": 2.349102020263672, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20283705741167068, + "step": 7738 + }, + { + "epoch": 0.48375, + "grad_norm": 2.0, + "grad_norm_var": 0.0125152587890625, + "learning_rate": 0.0001, + "loss": 7.0513, + "loss/crossentropy": 2.5229681730270386, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.22544901072978973, + "step": 7740 + }, + { + "epoch": 0.483875, + "grad_norm": 2.078125, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 7.2253, + "loss/crossentropy": 2.4761338233947754, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20555153489112854, + "step": 7742 + }, + { + "epoch": 0.484, + "grad_norm": 2.171875, + "grad_norm_var": 0.016153971354166668, + "learning_rate": 0.0001, + "loss": 7.1065, + "loss/crossentropy": 2.6251370906829834, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2124454379081726, + "step": 7744 + }, + { + "epoch": 0.484125, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02028376261393229, + "learning_rate": 0.0001, + "loss": 6.936, + "loss/crossentropy": 2.269635319709778, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.21971495449543, + "step": 7746 + }, + { + "epoch": 0.48425, + "grad_norm": 2.171875, + "grad_norm_var": 0.02039159138997396, + "learning_rate": 0.0001, + "loss": 7.0399, + "loss/crossentropy": 1.855255365371704, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.17584873735904694, + "step": 7748 + }, + { + "epoch": 0.484375, + "grad_norm": 2.265625, + "grad_norm_var": 0.017459869384765625, + "learning_rate": 0.0001, + "loss": 7.079, + "loss/crossentropy": 2.198126792907715, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20961421728134155, + "step": 7750 + }, + { + "epoch": 0.4845, + "grad_norm": 2.125, + "grad_norm_var": 0.016137440999348957, + "learning_rate": 0.0001, + "loss": 7.0269, + "loss/crossentropy": 1.967193365097046, + "loss/hidden": 2.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.19744029641151428, + "step": 7752 + }, + { + "epoch": 0.484625, + "grad_norm": 2.03125, + "grad_norm_var": 0.014597320556640625, + "learning_rate": 0.0001, + "loss": 7.0634, + "loss/crossentropy": 2.325754761695862, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20395881682634354, + "step": 7754 + }, + { + "epoch": 0.48475, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014699045817057292, + "learning_rate": 0.0001, + "loss": 7.0538, + "loss/crossentropy": 2.288987159729004, + "loss/hidden": 2.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2305854633450508, + "step": 7756 + }, + { + "epoch": 0.484875, + "grad_norm": 2.234375, + "grad_norm_var": 0.015207672119140625, + "learning_rate": 0.0001, + "loss": 7.3086, + "loss/crossentropy": 2.116866946220398, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21432197093963623, + "step": 7758 + }, + { + "epoch": 0.485, + "grad_norm": 2.203125, + "grad_norm_var": 0.011791737874348958, + "learning_rate": 0.0001, + "loss": 7.2918, + "loss/crossentropy": 2.250456213951111, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.22191955149173737, + "step": 7760 + }, + { + "epoch": 0.485125, + "grad_norm": 2.140625, + "grad_norm_var": 0.011043294270833334, + "learning_rate": 0.0001, + "loss": 7.1073, + "loss/crossentropy": 2.2381151914596558, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20548705756664276, + "step": 7762 + }, + { + "epoch": 0.48525, + "grad_norm": 2.203125, + "grad_norm_var": 0.011278279622395833, + "learning_rate": 0.0001, + "loss": 7.0467, + "loss/crossentropy": 2.271865129470825, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20705430209636688, + "step": 7764 + }, + { + "epoch": 0.485375, + "grad_norm": 2.28125, + "grad_norm_var": 0.0121978759765625, + "learning_rate": 0.0001, + "loss": 7.1154, + "loss/crossentropy": 2.157663583755493, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19460859149694443, + "step": 7766 + }, + { + "epoch": 0.4855, + "grad_norm": 2.171875, + "grad_norm_var": 0.0123779296875, + "learning_rate": 0.0001, + "loss": 7.1723, + "loss/crossentropy": 2.1781057119369507, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.221711665391922, + "step": 7768 + }, + { + "epoch": 0.485625, + "grad_norm": 2.078125, + "grad_norm_var": 0.0119049072265625, + "learning_rate": 0.0001, + "loss": 7.0052, + "loss/crossentropy": 2.017356812953949, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.1983555629849434, + "step": 7770 + }, + { + "epoch": 0.48575, + "grad_norm": 2.140625, + "grad_norm_var": 0.011551920572916667, + "learning_rate": 0.0001, + "loss": 7.0771, + "loss/crossentropy": 2.2357795238494873, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20853830128908157, + "step": 7772 + }, + { + "epoch": 0.485875, + "grad_norm": 2.34375, + "grad_norm_var": 0.016551717122395834, + "learning_rate": 0.0001, + "loss": 7.1366, + "loss/crossentropy": 2.2986165285110474, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22572917491197586, + "step": 7774 + }, + { + "epoch": 0.486, + "grad_norm": 2.21875, + "grad_norm_var": 0.01666259765625, + "learning_rate": 0.0001, + "loss": 7.2861, + "loss/crossentropy": 2.2475717067718506, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21900728344917297, + "step": 7776 + }, + { + "epoch": 0.486125, + "grad_norm": 2.0, + "grad_norm_var": 0.015201822916666666, + "learning_rate": 0.0001, + "loss": 7.047, + "loss/crossentropy": 2.310316801071167, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.20573000609874725, + "step": 7778 + }, + { + "epoch": 0.48625, + "grad_norm": 2.21875, + "grad_norm_var": 0.0154449462890625, + "learning_rate": 0.0001, + "loss": 7.0287, + "loss/crossentropy": 1.9829546213150024, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2091880813241005, + "step": 7780 + }, + { + "epoch": 0.486375, + "grad_norm": 1.953125, + "grad_norm_var": 0.017392730712890624, + "learning_rate": 0.0001, + "loss": 6.9216, + "loss/crossentropy": 2.3593095541000366, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.19612394273281097, + "step": 7782 + }, + { + "epoch": 0.4865, + "grad_norm": 2.25, + "grad_norm_var": 0.020623524983723957, + "learning_rate": 0.0001, + "loss": 7.1751, + "loss/crossentropy": 2.493327498435974, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.23639556765556335, + "step": 7784 + }, + { + "epoch": 0.486625, + "grad_norm": 2.15625, + "grad_norm_var": 0.024253082275390626, + "learning_rate": 0.0001, + "loss": 7.0933, + "loss/crossentropy": 2.37237012386322, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2250986099243164, + "step": 7786 + }, + { + "epoch": 0.48675, + "grad_norm": 2.03125, + "grad_norm_var": 0.022658030192057293, + "learning_rate": 0.0001, + "loss": 7.1574, + "loss/crossentropy": 2.3521467447280884, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21240167319774628, + "step": 7788 + }, + { + "epoch": 0.486875, + "grad_norm": 2.234375, + "grad_norm_var": 0.016200510660807292, + "learning_rate": 0.0001, + "loss": 7.0604, + "loss/crossentropy": 2.2158159017562866, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.19502168148756027, + "step": 7790 + }, + { + "epoch": 0.487, + "grad_norm": 2.046875, + "grad_norm_var": 0.015488433837890624, + "learning_rate": 0.0001, + "loss": 7.2302, + "loss/crossentropy": 2.2819695472717285, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2108946293592453, + "step": 7792 + }, + { + "epoch": 0.487125, + "grad_norm": 2.015625, + "grad_norm_var": 0.015295155843098958, + "learning_rate": 0.0001, + "loss": 7.0113, + "loss/crossentropy": 2.2795369625091553, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20005237311124802, + "step": 7794 + }, + { + "epoch": 0.48725, + "grad_norm": 2.203125, + "grad_norm_var": 0.015087636311848958, + "learning_rate": 0.0001, + "loss": 7.1739, + "loss/crossentropy": 2.5151994228363037, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.20011039823293686, + "step": 7796 + }, + { + "epoch": 0.487375, + "grad_norm": 2.171875, + "grad_norm_var": 0.013590494791666666, + "learning_rate": 0.0001, + "loss": 7.1157, + "loss/crossentropy": 2.0404099225997925, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21494626998901367, + "step": 7798 + }, + { + "epoch": 0.4875, + "grad_norm": 2.046875, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 7.0035, + "loss/crossentropy": 2.205460011959076, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20000936090946198, + "step": 7800 + }, + { + "epoch": 0.487625, + "grad_norm": 2.046875, + "grad_norm_var": 0.008210245768229167, + "learning_rate": 0.0001, + "loss": 7.0885, + "loss/crossentropy": 2.253046751022339, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.19893667101860046, + "step": 7802 + }, + { + "epoch": 0.48775, + "grad_norm": 2.3125, + "grad_norm_var": 0.010838826497395834, + "learning_rate": 0.0001, + "loss": 7.1092, + "loss/crossentropy": 2.250741481781006, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19479040056467056, + "step": 7804 + }, + { + "epoch": 0.487875, + "grad_norm": 2.0625, + "grad_norm_var": 0.010530598958333333, + "learning_rate": 0.0001, + "loss": 7.0907, + "loss/crossentropy": 2.1254579424858093, + "loss/hidden": 2.875, + "loss/jsd": 0.0, + "loss/logits": 0.2081373631954193, + "step": 7806 + }, + { + "epoch": 0.488, + "grad_norm": 2.0625, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 6.9847, + "loss/crossentropy": 2.2556028366088867, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.1956285983324051, + "step": 7808 + }, + { + "epoch": 0.488125, + "grad_norm": 2.265625, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 6.8772, + "loss/crossentropy": 2.0442580580711365, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.1880321502685547, + "step": 7810 + }, + { + "epoch": 0.48825, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011329905192057291, + "learning_rate": 0.0001, + "loss": 6.9359, + "loss/crossentropy": 1.964568853378296, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21166903525590897, + "step": 7812 + }, + { + "epoch": 0.488375, + "grad_norm": 2.234375, + "grad_norm_var": 0.010786692301432291, + "learning_rate": 0.0001, + "loss": 7.1827, + "loss/crossentropy": 2.41512668132782, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20831845700740814, + "step": 7814 + }, + { + "epoch": 0.4885, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01175537109375, + "learning_rate": 0.0001, + "loss": 7.1273, + "loss/crossentropy": 2.370983600616455, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.20832911878824234, + "step": 7816 + }, + { + "epoch": 0.488625, + "grad_norm": 2.140625, + "grad_norm_var": 0.0101806640625, + "learning_rate": 0.0001, + "loss": 7.0815, + "loss/crossentropy": 2.3157626390457153, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.1933603137731552, + "step": 7818 + }, + { + "epoch": 0.48875, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008310699462890625, + "learning_rate": 0.0001, + "loss": 6.9595, + "loss/crossentropy": 2.01755154132843, + "loss/hidden": 2.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.18479111790657043, + "step": 7820 + }, + { + "epoch": 0.488875, + "grad_norm": 2.140625, + "grad_norm_var": 0.008038075764973958, + "learning_rate": 0.0001, + "loss": 7.0953, + "loss/crossentropy": 2.116927742958069, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.1958414539694786, + "step": 7822 + }, + { + "epoch": 0.489, + "grad_norm": 2.109375, + "grad_norm_var": 0.007977040608723958, + "learning_rate": 0.0001, + "loss": 7.1639, + "loss/crossentropy": 2.151541829109192, + "loss/hidden": 2.71875, + "loss/jsd": 0.0, + "loss/logits": 0.1838311031460762, + "step": 7824 + }, + { + "epoch": 0.489125, + "grad_norm": 2.125, + "grad_norm_var": 0.0060727437337239586, + "learning_rate": 0.0001, + "loss": 7.053, + "loss/crossentropy": 2.4051939249038696, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20532245934009552, + "step": 7826 + }, + { + "epoch": 0.48925, + "grad_norm": 2.390625, + "grad_norm_var": 0.013841756184895833, + "learning_rate": 0.0001, + "loss": 7.1092, + "loss/crossentropy": 2.3758574724197388, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2034212425351143, + "step": 7828 + }, + { + "epoch": 0.489375, + "grad_norm": 2.015625, + "grad_norm_var": 0.014408365885416666, + "learning_rate": 0.0001, + "loss": 7.1397, + "loss/crossentropy": 2.2068817019462585, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.20511388778686523, + "step": 7830 + }, + { + "epoch": 0.4895, + "grad_norm": 2.171875, + "grad_norm_var": 0.013342030843098958, + "learning_rate": 0.0001, + "loss": 7.1086, + "loss/crossentropy": 2.3683449029922485, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.22085585445165634, + "step": 7832 + }, + { + "epoch": 0.489625, + "grad_norm": 2.015625, + "grad_norm_var": 0.016015370686848957, + "learning_rate": 0.0001, + "loss": 7.0163, + "loss/crossentropy": 2.137399196624756, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.18337388336658478, + "step": 7834 + }, + { + "epoch": 0.48975, + "grad_norm": 2.125, + "grad_norm_var": 0.014411417643229167, + "learning_rate": 0.0001, + "loss": 7.1261, + "loss/crossentropy": 2.2942042350769043, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2159949615597725, + "step": 7836 + }, + { + "epoch": 0.489875, + "grad_norm": 2.171875, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 7.1013, + "loss/crossentropy": 2.302956461906433, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.22014276683330536, + "step": 7838 + }, + { + "epoch": 0.49, + "grad_norm": 2.0625, + "grad_norm_var": 0.015511067708333333, + "learning_rate": 0.0001, + "loss": 7.0424, + "loss/crossentropy": 2.1671026945114136, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.18992354720830917, + "step": 7840 + }, + { + "epoch": 0.490125, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017622629801432293, + "learning_rate": 0.0001, + "loss": 7.0956, + "loss/crossentropy": 2.034866750240326, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.19620046019554138, + "step": 7842 + }, + { + "epoch": 0.49025, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010016886393229167, + "learning_rate": 0.0001, + "loss": 7.1249, + "loss/crossentropy": 2.373619318008423, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2142253741621971, + "step": 7844 + }, + { + "epoch": 0.490375, + "grad_norm": 2.453125, + "grad_norm_var": 0.018211873372395833, + "learning_rate": 0.0001, + "loss": 7.1376, + "loss/crossentropy": 2.135084390640259, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.20321637392044067, + "step": 7846 + }, + { + "epoch": 0.4905, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02012914021809896, + "learning_rate": 0.0001, + "loss": 7.0496, + "loss/crossentropy": 2.344667911529541, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2073012888431549, + "step": 7848 + }, + { + "epoch": 0.490625, + "grad_norm": 2.203125, + "grad_norm_var": 0.018790435791015626, + "learning_rate": 0.0001, + "loss": 6.8819, + "loss/crossentropy": 2.205102324485779, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.19368459284305573, + "step": 7850 + }, + { + "epoch": 0.49075, + "grad_norm": 1.96875, + "grad_norm_var": 0.01999079386393229, + "learning_rate": 0.0001, + "loss": 6.962, + "loss/crossentropy": 2.0952929258346558, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.19991254806518555, + "step": 7852 + }, + { + "epoch": 0.490875, + "grad_norm": 2.078125, + "grad_norm_var": 0.02063776652018229, + "learning_rate": 0.0001, + "loss": 7.0236, + "loss/crossentropy": 2.091593384742737, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20015785843133926, + "step": 7854 + }, + { + "epoch": 0.491, + "grad_norm": 2.203125, + "grad_norm_var": 0.019758860270182293, + "learning_rate": 0.0001, + "loss": 7.1106, + "loss/crossentropy": 2.1102964878082275, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20344851911067963, + "step": 7856 + }, + { + "epoch": 0.491125, + "grad_norm": 2.09375, + "grad_norm_var": 0.020869954427083334, + "learning_rate": 0.0001, + "loss": 7.0628, + "loss/crossentropy": 2.2350775003433228, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20482338964939117, + "step": 7858 + }, + { + "epoch": 0.49125, + "grad_norm": 2.015625, + "grad_norm_var": 0.019779459635416666, + "learning_rate": 0.0001, + "loss": 7.1046, + "loss/crossentropy": 1.9230089783668518, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.19790837168693542, + "step": 7860 + }, + { + "epoch": 0.491375, + "grad_norm": 1.984375, + "grad_norm_var": 0.0128662109375, + "learning_rate": 0.0001, + "loss": 7.0854, + "loss/crossentropy": 2.5659754276275635, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.20197945088148117, + "step": 7862 + }, + { + "epoch": 0.4915, + "grad_norm": 2.328125, + "grad_norm_var": 0.012967681884765625, + "learning_rate": 0.0001, + "loss": 7.2325, + "loss/crossentropy": 2.4076273441314697, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21878042817115784, + "step": 7864 + }, + { + "epoch": 0.491625, + "grad_norm": 2.25, + "grad_norm_var": 0.013685862223307291, + "learning_rate": 0.0001, + "loss": 7.192, + "loss/crossentropy": 2.3655115365982056, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2367181032896042, + "step": 7866 + }, + { + "epoch": 0.49175, + "grad_norm": 2.140625, + "grad_norm_var": 0.03374201456705729, + "learning_rate": 0.0001, + "loss": 7.2599, + "loss/crossentropy": 2.1696566343307495, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2092232033610344, + "step": 7868 + }, + { + "epoch": 0.491875, + "grad_norm": 2.0625, + "grad_norm_var": 0.03148778279622396, + "learning_rate": 0.0001, + "loss": 7.2385, + "loss/crossentropy": 2.376840353012085, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.21353555470705032, + "step": 7870 + }, + { + "epoch": 0.492, + "grad_norm": 2.046875, + "grad_norm_var": 0.03212865193684896, + "learning_rate": 0.0001, + "loss": 7.1475, + "loss/crossentropy": 2.085893988609314, + "loss/hidden": 2.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.20016048848628998, + "step": 7872 + }, + { + "epoch": 0.492125, + "grad_norm": 2.0, + "grad_norm_var": 0.03144912719726563, + "learning_rate": 0.0001, + "loss": 7.0677, + "loss/crossentropy": 2.2020708322525024, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.20381900668144226, + "step": 7874 + }, + { + "epoch": 0.49225, + "grad_norm": 2.140625, + "grad_norm_var": 0.029173787434895834, + "learning_rate": 0.0001, + "loss": 7.123, + "loss/crossentropy": 2.22407865524292, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20843693614006042, + "step": 7876 + }, + { + "epoch": 0.492375, + "grad_norm": 2.15625, + "grad_norm_var": 0.026432291666666666, + "learning_rate": 0.0001, + "loss": 7.2386, + "loss/crossentropy": 2.563064932823181, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.23229733109474182, + "step": 7878 + }, + { + "epoch": 0.4925, + "grad_norm": 2.125, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 0.0001, + "loss": 7.1823, + "loss/crossentropy": 2.2542184591293335, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20725636184215546, + "step": 7880 + }, + { + "epoch": 0.492625, + "grad_norm": 2.171875, + "grad_norm_var": 0.024430338541666666, + "learning_rate": 0.0001, + "loss": 7.051, + "loss/crossentropy": 2.041069507598877, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.20505793392658234, + "step": 7882 + }, + { + "epoch": 0.49275, + "grad_norm": 2.078125, + "grad_norm_var": 0.0036529541015625, + "learning_rate": 0.0001, + "loss": 7.2277, + "loss/crossentropy": 2.295996069908142, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.20132000744342804, + "step": 7884 + }, + { + "epoch": 0.492875, + "grad_norm": 2.828125, + "grad_norm_var": 0.0341949462890625, + "learning_rate": 0.0001, + "loss": 7.2021, + "loss/crossentropy": 2.152729630470276, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21744219958782196, + "step": 7886 + }, + { + "epoch": 0.493, + "grad_norm": 2.125, + "grad_norm_var": 0.033324178059895834, + "learning_rate": 0.0001, + "loss": 7.0682, + "loss/crossentropy": 2.158630609512329, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20471574366092682, + "step": 7888 + }, + { + "epoch": 0.493125, + "grad_norm": 2.234375, + "grad_norm_var": 0.030858357747395832, + "learning_rate": 0.0001, + "loss": 6.9869, + "loss/crossentropy": 2.0892770290374756, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.18370763957500458, + "step": 7890 + }, + { + "epoch": 0.49325, + "grad_norm": 2.03125, + "grad_norm_var": 0.03192952473958333, + "learning_rate": 0.0001, + "loss": 7.1989, + "loss/crossentropy": 2.492920398712158, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.218817800283432, + "step": 7892 + }, + { + "epoch": 0.493375, + "grad_norm": 2.140625, + "grad_norm_var": 0.03240559895833333, + "learning_rate": 0.0001, + "loss": 7.1465, + "loss/crossentropy": 2.3585835695266724, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.203622005879879, + "step": 7894 + }, + { + "epoch": 0.4935, + "grad_norm": 2.265625, + "grad_norm_var": 0.03378499348958333, + "learning_rate": 0.0001, + "loss": 7.1171, + "loss/crossentropy": 2.145370841026306, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.19818150997161865, + "step": 7896 + }, + { + "epoch": 0.493625, + "grad_norm": 2.171875, + "grad_norm_var": 0.03325093587239583, + "learning_rate": 0.0001, + "loss": 7.239, + "loss/crossentropy": 2.408655047416687, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.21892248839139938, + "step": 7898 + }, + { + "epoch": 0.49375, + "grad_norm": 2.125, + "grad_norm_var": 0.03749974568684896, + "learning_rate": 0.0001, + "loss": 7.0329, + "loss/crossentropy": 2.255719542503357, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21674372255802155, + "step": 7900 + }, + { + "epoch": 0.493875, + "grad_norm": 2.171875, + "grad_norm_var": 0.007592519124348958, + "learning_rate": 0.0001, + "loss": 7.1371, + "loss/crossentropy": 2.2832452058792114, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2183477208018303, + "step": 7902 + }, + { + "epoch": 0.494, + "grad_norm": 2.03125, + "grad_norm_var": 0.009100087483723958, + "learning_rate": 0.0001, + "loss": 7.1093, + "loss/crossentropy": 2.5752521753311157, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.22478312999010086, + "step": 7904 + }, + { + "epoch": 0.494125, + "grad_norm": 2.140625, + "grad_norm_var": 0.008141835530598959, + "learning_rate": 0.0001, + "loss": 7.0139, + "loss/crossentropy": 2.229183316230774, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.2103634625673294, + "step": 7906 + }, + { + "epoch": 0.49425, + "grad_norm": 2.078125, + "grad_norm_var": 0.009653472900390625, + "learning_rate": 0.0001, + "loss": 7.1175, + "loss/crossentropy": 2.100042998790741, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.19252710044384003, + "step": 7908 + }, + { + "epoch": 0.494375, + "grad_norm": 2.171875, + "grad_norm_var": 0.013268788655598959, + "learning_rate": 0.0001, + "loss": 7.301, + "loss/crossentropy": 2.3349530696868896, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.22781674563884735, + "step": 7910 + }, + { + "epoch": 0.4945, + "grad_norm": 2.0, + "grad_norm_var": 0.013352203369140624, + "learning_rate": 0.0001, + "loss": 7.0135, + "loss/crossentropy": 2.2375741004943848, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.19559404999017715, + "step": 7912 + }, + { + "epoch": 0.494625, + "grad_norm": 2.109375, + "grad_norm_var": 0.012680816650390624, + "learning_rate": 0.0001, + "loss": 7.093, + "loss/crossentropy": 2.447828769683838, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2270418256521225, + "step": 7914 + }, + { + "epoch": 0.49475, + "grad_norm": 2.28125, + "grad_norm_var": 0.012059529622395834, + "learning_rate": 0.0001, + "loss": 6.9899, + "loss/crossentropy": 2.0814391374588013, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.18889734894037247, + "step": 7916 + }, + { + "epoch": 0.494875, + "grad_norm": 1.96875, + "grad_norm_var": 0.0132476806640625, + "learning_rate": 0.0001, + "loss": 7.1485, + "loss/crossentropy": 2.4870957136154175, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.21650540828704834, + "step": 7918 + }, + { + "epoch": 0.495, + "grad_norm": 2.25, + "grad_norm_var": 0.0121734619140625, + "learning_rate": 0.0001, + "loss": 7.0928, + "loss/crossentropy": 2.1004719734191895, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.19623607397079468, + "step": 7920 + }, + { + "epoch": 0.495125, + "grad_norm": 2.515625, + "grad_norm_var": 0.0206207275390625, + "learning_rate": 0.0001, + "loss": 7.2285, + "loss/crossentropy": 2.592145800590515, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.20861030369997025, + "step": 7922 + }, + { + "epoch": 0.49525, + "grad_norm": 2.03125, + "grad_norm_var": 0.021271769205729166, + "learning_rate": 0.0001, + "loss": 7.0363, + "loss/crossentropy": 2.118270993232727, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.19777195155620575, + "step": 7924 + }, + { + "epoch": 0.495375, + "grad_norm": 2.015625, + "grad_norm_var": 0.0199615478515625, + "learning_rate": 0.0001, + "loss": 6.9582, + "loss/crossentropy": 2.176929235458374, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.21006185561418533, + "step": 7926 + }, + { + "epoch": 0.4955, + "grad_norm": 2.03125, + "grad_norm_var": 0.019074503580729166, + "learning_rate": 0.0001, + "loss": 7.0891, + "loss/crossentropy": 2.425865054130554, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.20186667144298553, + "step": 7928 + }, + { + "epoch": 0.495625, + "grad_norm": 2.046875, + "grad_norm_var": 0.020685831705729168, + "learning_rate": 0.0001, + "loss": 7.0731, + "loss/crossentropy": 2.2788286209106445, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20905498415231705, + "step": 7930 + }, + { + "epoch": 0.49575, + "grad_norm": 2.140625, + "grad_norm_var": 0.019010416666666665, + "learning_rate": 0.0001, + "loss": 7.0084, + "loss/crossentropy": 2.188755750656128, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.21286778151988983, + "step": 7932 + }, + { + "epoch": 0.495875, + "grad_norm": 1.96875, + "grad_norm_var": 0.019416300455729167, + "learning_rate": 0.0001, + "loss": 6.9673, + "loss/crossentropy": 2.3245939016342163, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.22744500637054443, + "step": 7934 + }, + { + "epoch": 0.496, + "grad_norm": 2.265625, + "grad_norm_var": 0.01890869140625, + "learning_rate": 0.0001, + "loss": 7.149, + "loss/crossentropy": 2.371795654296875, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21272078156471252, + "step": 7936 + }, + { + "epoch": 0.496125, + "grad_norm": 1.96875, + "grad_norm_var": 0.007624308268229167, + "learning_rate": 0.0001, + "loss": 6.9848, + "loss/crossentropy": 2.1958614587783813, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21580558270215988, + "step": 7938 + }, + { + "epoch": 0.49625, + "grad_norm": 2.046875, + "grad_norm_var": 0.007666015625, + "learning_rate": 0.0001, + "loss": 7.1961, + "loss/crossentropy": 2.513580083847046, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.202760711312294, + "step": 7940 + }, + { + "epoch": 0.496375, + "grad_norm": 2.0625, + "grad_norm_var": 0.00748291015625, + "learning_rate": 0.0001, + "loss": 7.1847, + "loss/crossentropy": 1.978775680065155, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.19025252759456635, + "step": 7942 + }, + { + "epoch": 0.4965, + "grad_norm": 2.34375, + "grad_norm_var": 0.0118316650390625, + "learning_rate": 0.0001, + "loss": 6.9504, + "loss/crossentropy": 2.2968384623527527, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.21156945079565048, + "step": 7944 + }, + { + "epoch": 0.496625, + "grad_norm": 2.90625, + "grad_norm_var": 0.051774088541666666, + "learning_rate": 0.0001, + "loss": 7.197, + "loss/crossentropy": 2.2490543127059937, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20457617938518524, + "step": 7946 + }, + { + "epoch": 0.49675, + "grad_norm": 2.09375, + "grad_norm_var": 0.051488240559895836, + "learning_rate": 0.0001, + "loss": 7.0335, + "loss/crossentropy": 2.0426313877105713, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.21022196114063263, + "step": 7948 + }, + { + "epoch": 0.496875, + "grad_norm": 2.046875, + "grad_norm_var": 0.0497222900390625, + "learning_rate": 0.0001, + "loss": 7.1089, + "loss/crossentropy": 2.0203962326049805, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.1993423029780388, + "step": 7950 + }, + { + "epoch": 0.497, + "grad_norm": 2.296875, + "grad_norm_var": 0.0502349853515625, + "learning_rate": 0.0001, + "loss": 7.0083, + "loss/crossentropy": 2.5307857990264893, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.21577821671962738, + "step": 7952 + }, + { + "epoch": 0.497125, + "grad_norm": 1.9453125, + "grad_norm_var": 0.05158665974934896, + "learning_rate": 0.0001, + "loss": 7.037, + "loss/crossentropy": 2.324398159980774, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20606698840856552, + "step": 7954 + }, + { + "epoch": 0.49725, + "grad_norm": 2.375, + "grad_norm_var": 0.05232518513997396, + "learning_rate": 0.0001, + "loss": 7.2288, + "loss/crossentropy": 2.4783180952072144, + "loss/hidden": 2.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2216300517320633, + "step": 7956 + }, + { + "epoch": 0.497375, + "grad_norm": 2.359375, + "grad_norm_var": 0.051401519775390626, + "learning_rate": 0.0001, + "loss": 7.213, + "loss/crossentropy": 2.411279320716858, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.21086601167917252, + "step": 7958 + }, + { + "epoch": 0.4975, + "grad_norm": 2.09375, + "grad_norm_var": 0.051092274983723956, + "learning_rate": 0.0001, + "loss": 7.2168, + "loss/crossentropy": 2.1537816524505615, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.20549767464399338, + "step": 7960 + }, + { + "epoch": 0.497625, + "grad_norm": 2.078125, + "grad_norm_var": 0.016281890869140624, + "learning_rate": 0.0001, + "loss": 7.1023, + "loss/crossentropy": 2.394239068031311, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.21313968300819397, + "step": 7962 + }, + { + "epoch": 0.49775, + "grad_norm": 2.125, + "grad_norm_var": 0.016798655192057293, + "learning_rate": 0.0001, + "loss": 7.185, + "loss/crossentropy": 2.42279052734375, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2198297083377838, + "step": 7964 + }, + { + "epoch": 0.497875, + "grad_norm": 2.140625, + "grad_norm_var": 0.01599909464518229, + "learning_rate": 0.0001, + "loss": 7.0906, + "loss/crossentropy": 2.2097359895706177, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.20080310106277466, + "step": 7966 + }, + { + "epoch": 0.498, + "grad_norm": 2.15625, + "grad_norm_var": 0.014574940999348958, + "learning_rate": 0.0001, + "loss": 7.3154, + "loss/crossentropy": 2.3493038415908813, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.19426824152469635, + "step": 7968 + }, + { + "epoch": 0.498125, + "grad_norm": 2.125, + "grad_norm_var": 0.0106597900390625, + "learning_rate": 0.0001, + "loss": 7.1954, + "loss/crossentropy": 2.355955958366394, + "loss/hidden": 2.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2216314971446991, + "step": 7970 + }, + { + "epoch": 0.49825, + "grad_norm": 2.078125, + "grad_norm_var": 0.007811482747395833, + "learning_rate": 0.0001, + "loss": 7.2072, + "loss/crossentropy": 2.63875412940979, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.21204157173633575, + "step": 7972 + }, + { + "epoch": 0.498375, + "grad_norm": 2.171875, + "grad_norm_var": 0.0047686258951822914, + "learning_rate": 0.0001, + "loss": 7.0455, + "loss/crossentropy": 2.2208076119422913, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.2118864804506302, + "step": 7974 + }, + { + "epoch": 0.4985, + "grad_norm": 2.046875, + "grad_norm_var": 0.004939524332682291, + "learning_rate": 0.0001, + "loss": 7.228, + "loss/crossentropy": 2.2854151725769043, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20473261177539825, + "step": 7976 + }, + { + "epoch": 0.498625, + "grad_norm": 2.484375, + "grad_norm_var": 0.013315582275390625, + "learning_rate": 0.0001, + "loss": 7.1931, + "loss/crossentropy": 2.1924540996551514, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.22139258682727814, + "step": 7978 + }, + { + "epoch": 0.49875, + "grad_norm": 2.015625, + "grad_norm_var": 0.017267862955729168, + "learning_rate": 0.0001, + "loss": 7.061, + "loss/crossentropy": 2.3097928762435913, + "loss/hidden": 2.78125, + "loss/jsd": 0.0, + "loss/logits": 0.21281752735376358, + "step": 7980 + }, + { + "epoch": 0.498875, + "grad_norm": 2.15625, + "grad_norm_var": 0.016605631510416666, + "learning_rate": 0.0001, + "loss": 7.2304, + "loss/crossentropy": 2.5834310054779053, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20607245713472366, + "step": 7982 + }, + { + "epoch": 0.499, + "grad_norm": 1.984375, + "grad_norm_var": 0.01800537109375, + "learning_rate": 0.0001, + "loss": 7.168, + "loss/crossentropy": 2.2289873361587524, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.1980709359049797, + "step": 7984 + }, + { + "epoch": 0.499125, + "grad_norm": 2.15625, + "grad_norm_var": 0.02115453084309896, + "learning_rate": 0.0001, + "loss": 7.1071, + "loss/crossentropy": 2.3941906690597534, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2208108827471733, + "step": 7986 + }, + { + "epoch": 0.49925, + "grad_norm": 2.671875, + "grad_norm_var": 0.055149078369140625, + "learning_rate": 0.0001, + "loss": 7.3124, + "loss/crossentropy": 2.1894861459732056, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.218056783080101, + "step": 7988 + }, + { + "epoch": 0.499375, + "grad_norm": 2.0, + "grad_norm_var": 0.05746256510416667, + "learning_rate": 0.0001, + "loss": 7.0268, + "loss/crossentropy": 2.122998356819153, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.20764590054750443, + "step": 7990 + }, + { + "epoch": 0.4995, + "grad_norm": 2.21875, + "grad_norm_var": 0.056574503580729164, + "learning_rate": 0.0001, + "loss": 7.0048, + "loss/crossentropy": 2.28106951713562, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.20048896968364716, + "step": 7992 + }, + { + "epoch": 0.499625, + "grad_norm": 2.0625, + "grad_norm_var": 0.05073140462239583, + "learning_rate": 0.0001, + "loss": 7.197, + "loss/crossentropy": 2.450806140899658, + "loss/hidden": 2.828125, + "loss/jsd": 0.0, + "loss/logits": 0.20395724475383759, + "step": 7994 + }, + { + "epoch": 0.49975, + "grad_norm": 2.1875, + "grad_norm_var": 0.045171864827473956, + "learning_rate": 0.0001, + "loss": 7.0883, + "loss/crossentropy": 2.2239577770233154, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.22716625034809113, + "step": 7996 + }, + { + "epoch": 0.499875, + "grad_norm": 2.21875, + "grad_norm_var": 0.046213531494140626, + "learning_rate": 0.0001, + "loss": 7.1814, + "loss/crossentropy": 2.3571836948394775, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.20826996862888336, + "step": 7998 + }, + { + "epoch": 0.5, + "grad_norm": 2.0625, + "grad_norm_var": 0.04478530883789063, + "learning_rate": 0.0001, + "loss": 7.208, + "loss/crossentropy": 2.152498722076416, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.18669088929891586, + "step": 8000 + } + ], + "logging_steps": 2, + "max_steps": 16000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.666362485243904e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}