Upload weights and configs - Run 20251012_060013
Browse files- weights/best_model.safetensors +2 -2
- weights/best_model_metadata.json +290 -59
- weights/david_config.json +28 -12
- weights/train_config.json +16 -5
weights/best_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f94cb043f3e87c5d250a453926dd14b701bdde034e1ed7d0d3ca723de7a315e2
|
| 3 |
+
size 325845164
|
weights/best_model_metadata.json
CHANGED
|
@@ -1,46 +1,36 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
"optimizer_state_dict": {
|
| 4 |
"state": {
|
| 5 |
"0": {
|
| 6 |
-
"step": "tensor(
|
| 7 |
-
"exp_avg": "tensor([[
|
| 8 |
-
"exp_avg_sq": "tensor([[
|
| 9 |
},
|
| 10 |
"1": {
|
| 11 |
-
"step": "tensor(
|
| 12 |
-
"exp_avg": "tensor([
|
| 13 |
-
"exp_avg_sq": "tensor([
|
| 14 |
},
|
| 15 |
"2": {
|
| 16 |
-
"step": "tensor(
|
| 17 |
-
"exp_avg": "tensor([
|
| 18 |
-
"exp_avg_sq": "tensor([
|
| 19 |
},
|
| 20 |
"3": {
|
| 21 |
-
"step": "tensor(
|
| 22 |
-
"exp_avg": "tensor([
|
| 23 |
-
"exp_avg_sq": "tensor([
|
| 24 |
},
|
| 25 |
"4": {
|
| 26 |
-
"step": "tensor(
|
| 27 |
-
"exp_avg": "tensor([[
|
| 28 |
-
"exp_avg_sq": "tensor([[
|
| 29 |
-
},
|
| 30 |
-
"5": {
|
| 31 |
-
"step": "tensor(12520.)",
|
| 32 |
-
"exp_avg": "tensor([[ 1.7989e-06, 4.2190e-06, 6.5448e-06, ..., 3.0508e-06,\n -1.8514e-06, 5.9576e-06],\n [ 6.3176e-06, 1.6977e-05, 1.1505e-05, ..., 3.7173e-05,\n 4.9809e-06, 1.5525e-05],\n [-1.6708e-05, 9.5249e-06, -1.1249e-05, ..., -1.5451e-05,\n -7.9721e-06, 1.9833e-05],\n ...,\n [ 1.0364e-05, 1.1385e-05, -1.1192e-05, ..., -8.4844e-06,\n -5.6640e-06, -1.1401e-05],\n [-1.7906e-06, 1.9834e-06, 1.0815e-06, ..., -1.2380e-05,\n -6.2534e-06, -9.2382e-06],\n [ 9.6819e-06, -7.3784e-06, 4.8784e-06, ..., -1.4418e-05,\n -9.4953e-06, -7.2564e-06]], device='cuda:0')",
|
| 33 |
-
"exp_avg_sq": "tensor([[8.1560e-10, 1.3475e-09, 1.6525e-09, ..., 1.2488e-09, 5.9595e-10,\n 2.3552e-09],\n [1.7554e-09, 4.1794e-09, 1.5715e-09, ..., 1.7226e-09, 1.9271e-09,\n 2.3219e-09],\n [2.2232e-09, 4.9447e-09, 1.8756e-09, ..., 2.8584e-09, 1.2778e-09,\n 3.8880e-09],\n ...,\n [2.5199e-09, 3.9330e-09, 2.5599e-09, ..., 1.6822e-09, 1.6663e-09,\n 1.8369e-09],\n [2.6684e-09, 7.1982e-09, 2.0619e-09, ..., 2.2440e-09, 1.4449e-09,\n 2.9132e-09],\n [1.2986e-09, 3.4277e-09, 1.6512e-09, ..., 2.6567e-09, 1.1022e-09,\n 3.1725e-09]], device='cuda:0')"
|
| 34 |
-
},
|
| 35 |
-
"6": {
|
| 36 |
-
"step": "tensor(12520.)",
|
| 37 |
-
"exp_avg": "tensor([ 0.0002, -0.0002], device='cuda:0')",
|
| 38 |
-
"exp_avg_sq": "tensor([5.9675e-06, 5.9675e-06], device='cuda:0')"
|
| 39 |
}
|
| 40 |
},
|
| 41 |
"param_groups": [
|
| 42 |
{
|
| 43 |
-
"lr": 0.
|
| 44 |
"name": "shared",
|
| 45 |
"betas": [
|
| 46 |
0.9,
|
|
@@ -55,16 +45,14 @@
|
|
| 55 |
"differentiable": false,
|
| 56 |
"fused": null,
|
| 57 |
"decoupled_weight_decay": true,
|
| 58 |
-
"initial_lr": 0.
|
| 59 |
"params": [
|
| 60 |
0,
|
| 61 |
-
1
|
| 62 |
-
2,
|
| 63 |
-
3
|
| 64 |
]
|
| 65 |
},
|
| 66 |
{
|
| 67 |
-
"lr": 0.
|
| 68 |
"name": "scale_256",
|
| 69 |
"betas": [
|
| 70 |
0.9,
|
|
@@ -79,13 +67,15 @@
|
|
| 79 |
"differentiable": false,
|
| 80 |
"fused": null,
|
| 81 |
"decoupled_weight_decay": true,
|
| 82 |
-
"initial_lr": 0.
|
| 83 |
"params": [
|
|
|
|
|
|
|
| 84 |
4
|
| 85 |
]
|
| 86 |
},
|
| 87 |
{
|
| 88 |
-
"lr": 0.
|
| 89 |
"name": "scale_512",
|
| 90 |
"betas": [
|
| 91 |
0.9,
|
|
@@ -100,13 +90,199 @@
|
|
| 100 |
"differentiable": false,
|
| 101 |
"fused": null,
|
| 102 |
"decoupled_weight_decay": true,
|
| 103 |
-
"initial_lr": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
"params": [
|
| 105 |
-
|
|
|
|
|
|
|
| 106 |
]
|
| 107 |
},
|
| 108 |
{
|
| 109 |
-
"lr": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
"name": "fusion",
|
| 111 |
"betas": [
|
| 112 |
0.9,
|
|
@@ -121,60 +297,115 @@
|
|
| 121 |
"differentiable": false,
|
| 122 |
"fused": null,
|
| 123 |
"decoupled_weight_decay": true,
|
| 124 |
-
"initial_lr": 0.
|
| 125 |
"params": [
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
]
|
| 128 |
}
|
| 129 |
]
|
| 130 |
},
|
| 131 |
"scheduler_state_dict": {
|
| 132 |
"T_0": 10,
|
| 133 |
-
"T_i":
|
| 134 |
"T_mult": 2,
|
| 135 |
"eta_min": 1e-06,
|
| 136 |
-
"T_cur":
|
| 137 |
"base_lrs": [
|
| 138 |
-
0.
|
| 139 |
-
0.
|
| 140 |
-
0.
|
| 141 |
-
0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
],
|
| 143 |
-
"last_epoch":
|
| 144 |
"_step_count": 0,
|
| 145 |
"_is_initial": false,
|
| 146 |
"_get_lr_called_within_step": false,
|
| 147 |
"_last_lr": [
|
| 148 |
-
0.
|
| 149 |
-
0.
|
| 150 |
-
0.
|
| 151 |
-
0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
]
|
| 153 |
},
|
| 154 |
"metrics": {
|
| 155 |
-
"best_val_acc":
|
| 156 |
-
"best_epoch":
|
| 157 |
"scale_accuracies": {
|
| 158 |
-
"256":
|
| 159 |
-
"512": 75.302
|
| 160 |
}
|
| 161 |
},
|
| 162 |
"train_config": {
|
| 163 |
"name": "david_training",
|
| 164 |
"run_id": "20251012_060013",
|
| 165 |
"dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
|
| 166 |
-
"model_variant": "
|
| 167 |
"num_classes": 1000,
|
| 168 |
-
"preset": "
|
| 169 |
"custom_config_path": null,
|
| 170 |
"num_classes_override": null,
|
| 171 |
"use_belly_override": null,
|
| 172 |
"belly_expand_override": null,
|
| 173 |
-
"progressive_training_override":
|
| 174 |
-
"scale_warmup_epochs_override":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
"num_epochs": 10,
|
| 176 |
"batch_size": 1024,
|
| 177 |
-
"learning_rate": 0.
|
| 178 |
"weight_decay": 1e-05,
|
| 179 |
"warmup_epochs": 0,
|
| 180 |
"use_rose_loss": true,
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 0,
|
| 3 |
"optimizer_state_dict": {
|
| 4 |
"state": {
|
| 5 |
"0": {
|
| 6 |
+
"step": "tensor(1252.)",
|
| 7 |
+
"exp_avg": "tensor([[ 7.6931e-04, -3.5975e-04, -2.7454e-04, ..., 3.4282e-04,\n 1.6405e-04, -2.7417e-04],\n [-1.1996e-04, -1.1731e-04, -9.9866e-05, ..., 1.6243e-04,\n 2.2704e-04, 5.8436e-05],\n [-2.3594e-05, -1.2594e-04, 1.5107e-04, ..., -3.0109e-04,\n -7.9584e-05, 6.9697e-05],\n ...,\n [-1.2359e-04, -1.4653e-05, 2.1027e-04, ..., -1.2977e-04,\n 1.0232e-04, 1.3696e-04],\n [ 1.5723e-04, 9.8239e-05, -1.2136e-04, ..., 1.6422e-04,\n 1.6617e-04, -3.1527e-04],\n [ 3.0708e-04, -7.8846e-04, 5.5491e-05, ..., -8.6530e-05,\n 2.0349e-04, -1.3880e-04]], device='cuda:0')",
|
| 8 |
+
"exp_avg_sq": "tensor([[7.4118e-07, 1.0833e-06, 5.3915e-07, ..., 5.2782e-07, 4.1944e-07,\n 3.6632e-07],\n [3.1076e-07, 3.0220e-07, 3.1742e-07, ..., 2.1258e-07, 1.9540e-07,\n 1.6665e-07],\n [6.6361e-07, 7.0572e-07, 4.0617e-07, ..., 4.1036e-07, 3.5275e-07,\n 2.9789e-07],\n ...,\n [4.7457e-07, 3.5343e-07, 3.7321e-07, ..., 3.1882e-07, 2.7955e-07,\n 2.5810e-07],\n [5.7871e-07, 5.5442e-07, 4.4099e-07, ..., 3.3348e-07, 3.2790e-07,\n 2.9927e-07],\n [5.5174e-07, 5.0187e-07, 2.8968e-07, ..., 3.2638e-07, 3.1291e-07,\n 2.3588e-07]], device='cuda:0')"
|
| 9 |
},
|
| 10 |
"1": {
|
| 11 |
+
"step": "tensor(1252.)",
|
| 12 |
+
"exp_avg": "tensor([ 0.0155, -0.0063, -0.0059, ..., -0.0034, 0.0076, 0.0054],\n device='cuda:0')",
|
| 13 |
+
"exp_avg_sq": "tensor([0.0012, 0.0005, 0.0010, ..., 0.0009, 0.0010, 0.0007], device='cuda:0')"
|
| 14 |
},
|
| 15 |
"2": {
|
| 16 |
+
"step": "tensor(1252.)",
|
| 17 |
+
"exp_avg": "tensor([[ 1.3985e-05, -5.8933e-05, 1.0128e-04, ..., 2.2678e-04,\n -1.4801e-04, -2.0768e-05],\n [ 4.3151e-04, 1.1516e-04, -8.9230e-05, ..., 4.8063e-05,\n -2.3465e-04, 4.5604e-06],\n [ 5.6052e-45, 5.6052e-45, 5.6052e-45, ..., -3.1455e-32,\n 9.5699e-32, -5.6052e-45],\n ...,\n [ 6.3424e-09, -1.6865e-05, 2.2038e-07, ..., -4.1170e-07,\n -9.3469e-06, -1.4808e-07],\n [ 1.1432e-05, 3.8929e-05, -3.2332e-05, ..., 7.8134e-07,\n 4.3614e-05, -1.0174e-05],\n [-9.4698e-07, -2.8017e-07, -1.3962e-07, ..., -1.0875e-04,\n -4.0648e-07, -4.5041e-06]], device='cuda:0')",
|
| 18 |
+
"exp_avg_sq": "tensor([[1.0298e-06, 4.7161e-07, 2.3258e-07, ..., 1.3600e-06, 7.8774e-07,\n 6.1178e-07],\n [1.9419e-06, 8.5299e-07, 6.2341e-07, ..., 1.7004e-06, 8.0653e-07,\n 6.4486e-07],\n [1.7955e-10, 7.0238e-10, 3.7342e-11, ..., 5.1371e-10, 1.6562e-10,\n 1.7251e-10],\n ...,\n [7.4679e-11, 4.4698e-09, 6.7743e-11, ..., 3.5092e-10, 5.4696e-09,\n 2.4203e-10],\n [1.2516e-09, 6.6520e-08, 8.9164e-09, ..., 7.1810e-09, 1.6729e-08,\n 4.2439e-09],\n [1.1515e-08, 7.9548e-09, 6.6302e-09, ..., 7.8305e-07, 3.6041e-09,\n 6.1598e-08]], device='cuda:0')"
|
| 19 |
},
|
| 20 |
"3": {
|
| 21 |
+
"step": "tensor(1252.)",
|
| 22 |
+
"exp_avg": "tensor([ 2.3875e-02, -4.3837e-02, 9.6032e-26, 4.2214e-02, 2.2210e-02,\n 3.3747e-02, 1.7797e-02, -7.0004e-03, -2.4879e-02, 2.7619e-02,\n -7.5551e-03, 1.7134e-02, 2.0256e-02, -2.3314e-03, -7.4220e-02,\n -4.7382e-03, 1.4119e-02, -1.1545e-02, -8.9664e-03, 2.9760e-02,\n 2.9607e-03, -2.0514e-02, -7.0222e-03, -4.5160e-02, -2.3961e-02,\n 2.8237e-04, -1.7209e-02, 2.0788e-02, -7.2040e-03, 3.1665e-02,\n 5.6030e-03, 3.4553e-02, 3.6798e-02, -7.2686e-03, 3.0085e-02,\n 7.4137e-03, -6.1774e-03, 5.0785e-02, 4.7387e-03, 3.7325e-02,\n 2.7934e-02, -3.6341e-02, -2.1846e-02, 5.6052e-45, -2.4211e-02,\n 2.2644e-03, 5.5101e-02, 1.4802e-02, -5.4617e-03, -1.9478e-02,\n -6.5484e-03, -1.6414e-02, -4.0664e-04, 4.5538e-02, 5.8758e-03,\n 2.9059e-02, -1.2425e-02, 3.5538e-03, 4.1299e-03, 9.5794e-03,\n -4.0616e-02, 4.8972e-03, 2.5462e-02, 7.2303e-02, 1.5787e-03,\n 4.5761e-02, 1.9496e-02, 4.2323e-02, 3.9632e-02, -1.0082e-03,\n 5.3938e-03, 3.3493e-05, -1.3032e-02, -3.8153e-02, 1.5953e-02,\n 4.4613e-06, 7.0803e-03, -3.0465e-04, -9.4197e-03, -2.4202e-02,\n 9.6786e-02, -7.4001e-03, -2.9757e-02, 2.0539e-02, -2.8599e-02,\n 2.9271e-02, -1.7663e-03, 7.1197e-03, 2.4915e-02, 1.2627e-32,\n -3.4740e-02, 8.6782e-03, -2.3629e-09, 1.6301e-29, 1.5997e-02,\n 4.4684e-02, 3.8713e-04, -1.4058e-02, 2.3174e-02, -1.3123e-02,\n 9.1376e-03, -1.8078e-02, 1.6822e-02, 1.1673e-05, 1.4093e-02,\n 3.1474e-02, -1.2471e-02, -3.0086e-02, -2.3609e-02, 7.3296e-03,\n 5.4473e-36, 1.4037e-02, -1.1540e-02, 3.6438e-03, 2.6305e-03,\n -1.2727e-02, 2.7998e-03, -1.4901e-02, 2.1351e-02, -4.3767e-15,\n 1.8248e-31, 6.4915e-02, 1.7753e-02, 4.6665e-02, 1.1710e-02,\n -1.0307e-02, 2.7988e-02, 2.5342e-02, 2.9874e-02, -3.5108e-02,\n -2.3540e-03, -6.9490e-02, -7.4078e-03, 2.1446e-02, -1.5282e-02,\n -9.1815e-03, -5.9992e-03, -1.6854e-02, -1.2549e-02, -2.3482e-02,\n -2.0157e-02, 2.2761e-19, 5.0847e-02, 2.5370e-02, -9.5745e-03,\n 2.6735e-02, 6.3855e-03, 2.5707e-02, -3.6340e-02, -5.3170e-02,\n 2.5905e-02, -2.7917e-04, 5.1961e-02, -8.5205e-03, -2.0016e-02,\n 7.9647e-03, 2.6824e-28, -1.5696e-02, 1.9396e-02, 9.1115e-03,\n -5.6675e-02, 3.7877e-02, -2.4820e-03, 3.0127e-02, 3.7650e-02,\n 1.3167e-02, -6.0690e-02, 1.5634e-03, 2.0013e-02, -2.8012e-03,\n 2.7729e-02, 6.1169e-03, 7.8048e-04, 5.2743e-03, -3.7234e-02,\n -6.2202e-03, -4.8082e-03, 3.5392e-02, 2.1980e-02, 6.6747e-03,\n -5.1266e-04, 4.6985e-02, 5.6052e-45, 7.1385e-05, 3.2047e-03,\n -3.6594e-02, 1.8168e-02, -6.2917e-02, -3.2929e-02, -5.6694e-03,\n 1.9831e-02, -8.2062e-04, 1.5245e-02, -3.5059e-02, -7.1714e-03,\n 3.2220e-03, -2.1203e-04, -6.0919e-03, 1.4150e-02, -1.7816e-02,\n -1.6064e-02, 1.0769e-02, -2.6079e-02, -9.8037e-03, -3.2816e-02,\n -5.2574e-02, -4.7450e-03, 1.9516e-02, 3.4608e-03, 1.7244e-02,\n 2.6142e-13, 6.6794e-04, 2.7124e-02, -3.4816e-02, -3.1702e-02,\n 3.3221e-03, 3.8579e-02, -3.2285e-02, 1.9722e-02, 1.8849e-02,\n -3.3065e-02, 5.6052e-45, -7.1891e-03, -2.5881e-02, 6.2611e-03,\n -1.4979e-02, -6.4840e-03, 3.5860e-02, -2.7866e-02, -1.3252e-02,\n 2.1320e-02, 1.7814e-02, -3.2913e-02, -2.2322e-02, 2.0048e-03,\n 9.2803e-03, -8.2572e-03, -5.2975e-03, 7.6453e-02, 4.8804e-02,\n -2.1919e-02, 5.6761e-03, 6.3496e-02, -2.8889e-02, 1.3841e-02,\n 8.5695e-03, -1.5521e-02, 4.9531e-02, -4.1589e-02, -1.9676e-02,\n -5.5857e-03, -2.3588e-03, -1.4790e-02, -1.3780e-03, -6.3026e-03,\n -1.0188e-02, -3.3702e-02, 1.9226e-42, 2.2306e-02, 2.6239e-02,\n 4.8861e-26, -5.2627e-03, 2.5603e-06, 7.0623e-03, -4.3391e-03,\n 6.8443e-03, -1.6604e-02, -1.0740e-02, -1.5329e-02, 2.6949e-03,\n -8.0152e-03, -5.2252e-03, 8.3405e-03, 4.8676e-03, 2.9752e-02,\n 2.1431e-02, 5.0216e-02, 2.7263e-02, 8.0592e-03, 8.6727e-35,\n -2.2584e-02, -2.4758e-02, -2.9040e-02, -1.3793e-02, 1.8160e-03,\n -6.5728e-04, -4.3327e-02, 5.8940e-03, -1.1455e-02, -1.5254e-02,\n -1.3119e-02, -2.3658e-02, -6.1895e-03, 2.2715e-02, 2.3466e-02,\n 9.7893e-03, 3.0946e-02, -5.9481e-02, 6.9836e-09, 3.0786e-02,\n 2.9153e-02, -2.5735e-02, -7.1205e-03, -2.9654e-03, 3.6341e-02,\n -1.8624e-02, 2.3698e-02, -4.9125e-03, 1.7921e-02, -4.5526e-03,\n 2.6053e-02, -3.2306e-02, 7.3569e-02, -2.9119e-02, -8.4770e-03,\n 2.6320e-02, -3.1927e-02, 8.3118e-03, 2.1609e-02, 8.7364e-03,\n -6.7107e-03, 1.2142e-02, 8.1370e-03, -3.0566e-02, -1.4636e-02,\n 1.1678e-30, 6.2592e-05, 7.6997e-03, -5.0519e-02, 5.8788e-03,\n 3.3712e-02, 1.9211e-11, -2.0315e-02, 1.6669e-02, -4.9647e-03,\n -8.5470e-03, -5.1239e-03, 2.8877e-14, 8.1417e-08, -3.9018e-03,\n -2.0326e-02, -1.0452e-02, 4.1878e-03, -1.8927e-02, -4.9301e-02,\n -2.3229e-02, 1.7848e-03, 8.9783e-03, -1.2318e-02, 5.6052e-45,\n -1.9077e-02, 1.0073e-02, 1.1951e-02, -4.6252e-04, 5.6052e-45,\n -8.4038e-03, -4.9877e-03, 2.4088e-02, -1.3139e-02, -3.8403e-03,\n 2.5313e-03, -2.6408e-02, 7.3548e-04, -9.5045e-04, 3.6078e-03,\n 8.7422e-03, -1.8084e-02, -1.2749e-02, -1.7397e-04, -4.0832e-02,\n 1.0829e-02, -3.6196e-02, 6.3083e-23, 3.7562e-27, 3.7017e-03,\n 2.1969e-03, -1.5024e-02, -2.7693e-09, 2.8061e-03, -8.4934e-03,\n -1.6902e-02, 2.5158e-04, -9.0583e-03, -7.4322e-03, 2.2614e-02,\n 1.1432e-02, 7.6335e-03, 3.2598e-02, -8.4487e-03, 6.1142e-03,\n -4.9562e-03, 3.3930e-03, 2.8782e-02, -1.7597e-02, 8.4994e-04,\n 1.8249e-02, -2.3071e-02, 5.7733e-32, 8.4071e-30, -6.5391e-03,\n 2.4742e-02, -1.5463e-02, -3.7512e-02, 1.4492e-03, 1.0992e-03,\n -4.5201e-02, -1.0661e-02, 5.9124e-03, -4.8737e-04, 2.2978e-33,\n -2.1474e-05, -3.3565e-02, 1.7868e-02, -3.2502e-04, 1.2170e-05,\n 3.8928e-03, 9.7415e-03, 2.9308e-02, 1.1940e-02, 1.9191e-02,\n -8.1647e-03, 3.0442e-03, -3.8888e-03, -2.5077e-02, 1.2250e-02,\n -1.6413e-03, -2.5531e-02, -4.8607e-03, 2.5237e-02, 4.8066e-02,\n 7.1922e-03, -1.8733e-02, 1.1283e-02, 1.6394e-06, 4.4103e-03,\n 1.4819e-02, -8.5443e-03, -5.3989e-02, -1.8563e-02, -4.3936e-02,\n -1.3040e-02, 7.8469e-03, -6.3018e-02, 6.5663e-03, 6.0159e-03,\n 8.6966e-03, -7.3879e-03, -1.2783e-02, -3.4078e-02, 1.4064e-02,\n -2.5382e-02, -1.6579e-04, 2.1868e-02, -3.5772e-02, -6.2800e-03,\n 7.7235e-04, 1.8875e-03, -3.4342e-02, 2.7571e-02, -1.1460e-02,\n 3.1115e-02, -4.2305e-03, 7.3832e-40, -8.5955e-04, -1.5269e-02,\n 2.9632e-02, 5.7214e-02, 6.3015e-02, -3.4968e-02, 2.6324e-02,\n 1.2256e-02, 8.9374e-13, 3.2504e-02, -3.6663e-02, 2.8071e-19,\n 8.0190e-02, -5.5870e-03, -3.0679e-03, 2.4492e-03, 3.3198e-02,\n -2.3608e-03, 5.6052e-45, 1.9051e-02, 5.2085e-02, 1.8931e-02,\n 6.1182e-03, -6.7992e-02, 5.6052e-45, 5.1587e-03, 3.5890e-02,\n 2.1678e-02, -1.4406e-02, -5.4601e-03, 1.9144e-02, 1.2888e-02,\n 5.7364e-05, 1.1321e-02, -3.3445e-02, -1.4640e-04, 3.2436e-07,\n -2.9884e-03, -4.3738e-02, 1.6330e-17, -4.8169e-02, 9.6817e-03,\n 2.2606e-25, 3.1364e-02, 5.9806e-04, -2.9483e-02, -9.8904e-04,\n 2.6328e-03, 8.8098e-03], device='cuda:0')",
|
| 23 |
+
"exp_avg_sq": "tensor([1.8764e-02, 1.8272e-02, 2.1116e-06, 1.3778e-02, 3.6853e-03, 6.6573e-03,\n 1.8188e-02, 1.7011e-02, 1.8539e-02, 1.0614e-02, 3.9648e-03, 2.1009e-02,\n 3.1433e-03, 5.8576e-03, 1.6608e-02, 7.1616e-05, 4.8830e-03, 1.9907e-02,\n 6.7086e-03, 3.3539e-03, 4.3796e-03, 1.7794e-02, 5.7634e-03, 1.8685e-02,\n 1.5659e-02, 2.9941e-04, 1.5636e-02, 1.4540e-02, 1.8737e-02, 1.6553e-02,\n 1.5784e-02, 1.6078e-02, 1.6327e-02, 7.0099e-03, 1.0621e-02, 1.5467e-02,\n 1.3924e-03, 1.5506e-02, 1.8211e-02, 1.6032e-02, 1.1937e-02, 1.1553e-02,\n 4.5409e-04, 1.0012e-08, 1.4126e-02, 1.8995e-03, 2.2158e-02, 1.6567e-02,\n 1.8036e-02, 1.7208e-02, 4.1166e-04, 1.7573e-02, 3.8096e-03, 1.2772e-02,\n 2.0461e-02, 1.2595e-02, 2.1758e-02, 2.0982e-02, 1.5789e-02, 7.3397e-03,\n 1.5701e-02, 1.0106e-02, 1.5815e-02, 1.6533e-02, 2.0341e-02, 1.5778e-02,\n 5.0521e-03, 1.5900e-02, 2.2204e-02, 1.7102e-03, 1.6514e-02, 1.9365e-06,\n 8.2713e-03, 1.6110e-02, 1.5514e-02, 9.3026e-08, 1.3545e-02, 1.7353e-02,\n 1.5866e-02, 1.7169e-02, 1.9033e-02, 1.5958e-02, 8.2837e-03, 1.6548e-02,\n 2.2114e-02, 8.9850e-03, 1.6406e-02, 1.7232e-03, 1.2786e-02, 3.1307e-07,\n 4.9854e-03, 4.0865e-03, 3.4394e-07, 5.4478e-07, 1.4107e-02, 1.9926e-02,\n 1.0058e-04, 1.7903e-02, 1.6516e-02, 6.1304e-03, 4.3734e-03, 2.0879e-03,\n 3.7778e-03, 4.0758e-06, 1.6172e-02, 1.1247e-02, 1.8040e-02, 1.0636e-02,\n 1.0308e-02, 2.7883e-03, 5.5917e-06, 1.4616e-02, 6.2573e-03, 1.4898e-03,\n 1.7158e-02, 1.9321e-02, 1.9991e-02, 1.6465e-02, 8.9090e-03, 2.0490e-06,\n 1.1226e-07, 1.4761e-02, 8.8807e-03, 1.9667e-02, 1.7528e-02, 6.0540e-03,\n 1.6431e-02, 1.7108e-02, 1.7352e-02, 1.5799e-02, 1.6022e-02, 1.8787e-02,\n 3.5264e-03, 1.8629e-02, 1.8250e-02, 1.5643e-02, 1.5740e-02, 7.6894e-03,\n 1.5372e-02, 1.8558e-02, 2.0713e-02, 4.3730e-05, 2.0132e-02, 9.0030e-03,\n 1.6618e-02, 1.6743e-02, 1.6162e-02, 1.8333e-02, 1.7295e-02, 1.8789e-02,\n 1.4703e-02, 5.6712e-05, 1.4897e-02, 8.1449e-03, 2.0252e-02, 1.8296e-02,\n 5.1933e-08, 1.6048e-02, 9.2359e-04, 7.3904e-05, 1.8577e-02, 1.9959e-02,\n 1.3895e-02, 1.3569e-02, 1.6303e-02, 1.8068e-02, 1.5074e-02, 2.5697e-03,\n 1.5315e-02, 1.4183e-02, 1.5812e-02, 1.2953e-02, 6.2200e-03, 1.9376e-02,\n 1.8279e-02, 8.9495e-04, 1.8078e-02, 1.3957e-02, 1.7215e-02, 7.2711e-03,\n 1.5375e-02, 1.7679e-02, 3.1085e-06, 1.3844e-03, 1.1514e-02, 2.0096e-02,\n 1.6912e-02, 1.6843e-02, 5.8150e-03, 3.1123e-03, 6.3333e-03, 3.7299e-03,\n 1.5953e-02, 1.6468e-02, 2.5355e-04, 9.9471e-05, 1.1112e-03, 6.1182e-03,\n 1.0702e-02, 1.7909e-02, 1.3577e-03, 1.4747e-02, 9.0966e-03, 6.6425e-03,\n 1.2400e-02, 1.7522e-02, 2.6620e-03, 4.5561e-03, 4.0022e-03, 1.7254e-02,\n 6.1405e-06, 1.7645e-02, 1.8053e-02, 1.9364e-02, 2.1054e-02, 1.9905e-02,\n 1.6793e-02, 1.6094e-02, 1.1116e-02, 1.6410e-02, 1.6261e-02, 1.4181e-08,\n 9.1015e-03, 1.8053e-02, 5.7518e-03, 1.5482e-02, 2.0410e-04, 8.1102e-03,\n 1.8365e-02, 1.6995e-02, 1.0036e-03, 1.8236e-02, 1.5998e-02, 1.5762e-02,\n 1.6276e-02, 8.9184e-04, 1.7372e-02, 6.1681e-03, 1.8249e-02, 1.0992e-02,\n 1.4126e-02, 1.3468e-02, 1.8014e-02, 2.9758e-03, 1.7502e-02, 2.8353e-03,\n 1.8404e-02, 1.5680e-02, 1.7732e-02, 1.6440e-02, 1.8413e-02, 4.7115e-03,\n 1.8638e-02, 1.9537e-03, 1.8190e-02, 7.1188e-03, 8.9995e-03, 1.5712e-05,\n 1.6346e-02, 6.8524e-03, 6.3748e-07, 3.1674e-03, 9.1334e-07, 9.3132e-04,\n 5.6603e-03, 1.2174e-02, 1.5160e-02, 1.9009e-02, 1.5638e-02, 4.7673e-03,\n 1.7016e-02, 3.0314e-03, 1.9739e-02, 1.5944e-02, 1.2943e-02, 1.5510e-02,\n 8.6399e-03, 5.6596e-03, 6.2255e-03, 9.5688e-06, 1.6686e-02, 2.0533e-02,\n 1.8495e-02, 4.6201e-03, 4.1361e-03, 2.0182e-02, 3.7870e-03, 1.5436e-02,\n 1.1629e-02, 3.4890e-03, 1.0200e-02, 1.6730e-02, 1.2250e-02, 1.8554e-02,\n 6.7240e-03, 1.4706e-02, 1.7141e-02, 2.3400e-03, 2.9501e-06, 6.1673e-03,\n 7.6003e-03, 1.7310e-02, 1.4075e-02, 1.7002e-02, 1.7218e-02, 1.2266e-02,\n 1.5216e-02, 1.2400e-02, 1.8126e-02, 2.8434e-03, 1.3225e-02, 1.6580e-02,\n 1.9668e-02, 1.6248e-02, 1.6331e-02, 1.6534e-02, 2.1903e-03, 1.7074e-02,\n 2.6222e-03, 1.8165e-02, 1.4966e-02, 1.8902e-02, 1.3500e-02, 1.9297e-02,\n 7.4373e-03, 2.9604e-06, 1.3037e-05, 1.3654e-02, 1.4526e-02, 1.7412e-02,\n 1.7856e-02, 1.8401e-06, 1.7803e-02, 2.2227e-02, 1.4747e-04, 1.8303e-02,\n 3.3472e-03, 9.2779e-08, 1.2950e-06, 1.8184e-02, 1.4094e-02, 1.8324e-02,\n 1.6660e-02, 1.4634e-02, 1.3233e-02, 1.8590e-02, 1.6637e-02, 1.5577e-02,\n 1.6404e-02, 2.8314e-06, 1.8143e-03, 1.1277e-02, 1.6750e-02, 1.8065e-02,\n 2.6220e-06, 1.8991e-03, 1.8023e-02, 1.7743e-02, 1.2555e-02, 2.2086e-02,\n 1.7756e-02, 1.5363e-02, 1.6971e-02, 1.9232e-02, 1.6353e-02, 9.7520e-05,\n 1.6294e-02, 1.4263e-02, 1.6250e-02, 1.3664e-02, 3.4210e-03, 1.7195e-02,\n 2.6770e-06, 3.7029e-09, 2.2782e-03, 1.1965e-03, 5.5402e-03, 1.0826e-05,\n 1.7758e-02, 1.7853e-03, 1.6260e-02, 1.7396e-02, 8.8334e-03, 1.2946e-02,\n 4.7707e-03, 1.8341e-02, 1.5500e-02, 8.6413e-03, 1.4960e-02, 5.0860e-03,\n 1.8427e-02, 1.7727e-02, 1.8552e-02, 1.7232e-02, 1.5772e-02, 1.3550e-02,\n 1.9445e-02, 1.3188e-07, 1.7274e-05, 1.7283e-02, 1.4432e-02, 2.2199e-02,\n 1.2002e-02, 8.0722e-03, 2.4180e-03, 1.4398e-02, 1.2979e-02, 4.3121e-03,\n 7.7428e-03, 3.5046e-05, 1.9069e-05, 1.7086e-02, 1.0107e-02, 1.2835e-02,\n 3.0826e-05, 1.8859e-02, 1.6542e-02, 1.5748e-02, 2.0743e-03, 2.3803e-03,\n 1.6765e-02, 1.3605e-02, 5.1618e-03, 1.0833e-02, 1.4888e-02, 2.4556e-03,\n 1.4536e-02, 1.6998e-02, 9.2375e-03, 2.1265e-02, 2.0489e-03, 9.7723e-04,\n 1.7579e-02, 5.3052e-06, 1.5887e-04, 1.5638e-02, 1.5564e-02, 1.2871e-02,\n 1.6129e-02, 2.4327e-02, 4.8870e-03, 1.5943e-02, 2.0468e-02, 4.0398e-03,\n 1.9566e-02, 2.2624e-02, 1.2940e-02, 2.5075e-03, 1.6921e-02, 1.2306e-02,\n 1.5474e-02, 6.0880e-04, 1.8748e-02, 1.0882e-02, 1.6619e-03, 1.4346e-02,\n 1.6405e-02, 1.0783e-02, 1.7167e-02, 1.4428e-02, 1.5860e-02, 1.2360e-02,\n 1.4884e-09, 3.8985e-03, 6.8472e-03, 1.8204e-02, 1.3519e-02, 2.2676e-02,\n 1.2496e-02, 1.7735e-02, 1.2627e-02, 7.9700e-06, 1.8581e-02, 1.5395e-02,\n 1.4274e-05, 1.6384e-02, 7.2122e-03, 3.6154e-03, 1.5931e-02, 4.0571e-03,\n 1.8472e-02, 2.9771e-09, 2.8081e-03, 1.4730e-02, 1.5367e-02, 2.8912e-03,\n 2.0022e-02, 2.0257e-05, 1.3164e-02, 1.8537e-02, 1.3385e-02, 1.6409e-02,\n 1.2622e-02, 8.2167e-03, 1.4872e-02, 4.0220e-03, 1.2890e-02, 1.4525e-02,\n 5.0494e-04, 1.9957e-06, 4.1671e-04, 1.9199e-02, 3.1829e-07, 1.5725e-02,\n 1.4055e-02, 7.4773e-07, 1.3554e-02, 1.2199e-02, 5.8713e-03, 1.0020e-04,\n 1.9919e-04, 2.3390e-03], device='cuda:0')"
|
| 24 |
},
|
| 25 |
"4": {
|
| 26 |
+
"step": "tensor(1252.)",
|
| 27 |
+
"exp_avg": "tensor([[ 3.9230e-04, 8.6013e-05, -4.0562e-28, ..., -8.0357e-06,\n 3.1746e-05, -1.4099e-05],\n [ 4.2048e-04, 1.7068e-04, -2.2865e-28, ..., -1.9681e-05,\n -7.8058e-05, -9.3920e-05],\n [-5.8150e-04, -5.1838e-05, 1.9939e-28, ..., 3.0194e-07,\n 1.7539e-04, -8.2012e-06],\n ...,\n [ 3.6278e-04, 3.8798e-05, 4.8383e-29, ..., -1.4735e-05,\n -9.2320e-06, 1.6069e-05],\n [-3.5278e-04, -2.1873e-04, -1.6439e-28, ..., 3.7640e-05,\n -1.1982e-04, 3.7282e-05],\n [-6.4497e-05, 2.1634e-04, -2.8426e-28, ..., 5.2144e-06,\n 1.2079e-05, 1.8201e-05]], device='cuda:0')",
|
| 28 |
+
"exp_avg_sq": "tensor([[1.0953e-06, 1.3442e-06, 1.1048e-10, ..., 3.9482e-09, 8.9938e-09,\n 1.8019e-08],\n [2.1084e-06, 2.6551e-06, 1.6841e-10, ..., 1.5030e-09, 2.3215e-08,\n 9.7827e-08],\n [1.8753e-06, 2.5430e-06, 4.6574e-10, ..., 2.7687e-09, 8.8683e-09,\n 6.6008e-08],\n ...,\n [1.7657e-06, 2.8547e-06, 1.5489e-10, ..., 2.8203e-09, 1.1983e-08,\n 6.1273e-08],\n [1.7073e-06, 2.9791e-06, 1.0857e-10, ..., 2.7219e-09, 1.2247e-08,\n 6.0122e-08],\n [2.1537e-06, 3.3493e-06, 1.6559e-10, ..., 2.1838e-09, 2.0388e-08,\n 3.8560e-08]], device='cuda:0')"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
}
|
| 30 |
},
|
| 31 |
"param_groups": [
|
| 32 |
{
|
| 33 |
+
"lr": 0.0009755527298894294,
|
| 34 |
"name": "shared",
|
| 35 |
"betas": [
|
| 36 |
0.9,
|
|
|
|
| 45 |
"differentiable": false,
|
| 46 |
"fused": null,
|
| 47 |
"decoupled_weight_decay": true,
|
| 48 |
+
"initial_lr": 0.001,
|
| 49 |
"params": [
|
| 50 |
0,
|
| 51 |
+
1
|
|
|
|
|
|
|
| 52 |
]
|
| 53 |
},
|
| 54 |
{
|
| 55 |
+
"lr": 0.0009755527298894294,
|
| 56 |
"name": "scale_256",
|
| 57 |
"betas": [
|
| 58 |
0.9,
|
|
|
|
| 67 |
"differentiable": false,
|
| 68 |
"fused": null,
|
| 69 |
"decoupled_weight_decay": true,
|
| 70 |
+
"initial_lr": 0.001,
|
| 71 |
"params": [
|
| 72 |
+
2,
|
| 73 |
+
3,
|
| 74 |
4
|
| 75 |
]
|
| 76 |
},
|
| 77 |
{
|
| 78 |
+
"lr": 0.0009755527298894294,
|
| 79 |
"name": "scale_512",
|
| 80 |
"betas": [
|
| 81 |
0.9,
|
|
|
|
| 90 |
"differentiable": false,
|
| 91 |
"fused": null,
|
| 92 |
"decoupled_weight_decay": true,
|
| 93 |
+
"initial_lr": 0.001,
|
| 94 |
+
"params": [
|
| 95 |
+
5,
|
| 96 |
+
6,
|
| 97 |
+
7
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"lr": 0.0009755527298894294,
|
| 102 |
+
"name": "scale_768",
|
| 103 |
+
"betas": [
|
| 104 |
+
0.9,
|
| 105 |
+
0.999
|
| 106 |
+
],
|
| 107 |
+
"eps": 1e-08,
|
| 108 |
+
"weight_decay": 1e-05,
|
| 109 |
+
"amsgrad": false,
|
| 110 |
+
"maximize": false,
|
| 111 |
+
"foreach": null,
|
| 112 |
+
"capturable": false,
|
| 113 |
+
"differentiable": false,
|
| 114 |
+
"fused": null,
|
| 115 |
+
"decoupled_weight_decay": true,
|
| 116 |
+
"initial_lr": 0.001,
|
| 117 |
"params": [
|
| 118 |
+
8,
|
| 119 |
+
9,
|
| 120 |
+
10
|
| 121 |
]
|
| 122 |
},
|
| 123 |
{
|
| 124 |
+
"lr": 0.0009755527298894294,
|
| 125 |
+
"name": "scale_1024",
|
| 126 |
+
"betas": [
|
| 127 |
+
0.9,
|
| 128 |
+
0.999
|
| 129 |
+
],
|
| 130 |
+
"eps": 1e-08,
|
| 131 |
+
"weight_decay": 1e-05,
|
| 132 |
+
"amsgrad": false,
|
| 133 |
+
"maximize": false,
|
| 134 |
+
"foreach": null,
|
| 135 |
+
"capturable": false,
|
| 136 |
+
"differentiable": false,
|
| 137 |
+
"fused": null,
|
| 138 |
+
"decoupled_weight_decay": true,
|
| 139 |
+
"initial_lr": 0.001,
|
| 140 |
+
"params": [
|
| 141 |
+
11,
|
| 142 |
+
12,
|
| 143 |
+
13
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"lr": 0.0009755527298894294,
|
| 148 |
+
"name": "scale_1280",
|
| 149 |
+
"betas": [
|
| 150 |
+
0.9,
|
| 151 |
+
0.999
|
| 152 |
+
],
|
| 153 |
+
"eps": 1e-08,
|
| 154 |
+
"weight_decay": 1e-05,
|
| 155 |
+
"amsgrad": false,
|
| 156 |
+
"maximize": false,
|
| 157 |
+
"foreach": null,
|
| 158 |
+
"capturable": false,
|
| 159 |
+
"differentiable": false,
|
| 160 |
+
"fused": null,
|
| 161 |
+
"decoupled_weight_decay": true,
|
| 162 |
+
"initial_lr": 0.001,
|
| 163 |
+
"params": [
|
| 164 |
+
14,
|
| 165 |
+
15,
|
| 166 |
+
16
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"lr": 0.0009755527298894294,
|
| 171 |
+
"name": "scale_1536",
|
| 172 |
+
"betas": [
|
| 173 |
+
0.9,
|
| 174 |
+
0.999
|
| 175 |
+
],
|
| 176 |
+
"eps": 1e-08,
|
| 177 |
+
"weight_decay": 1e-05,
|
| 178 |
+
"amsgrad": false,
|
| 179 |
+
"maximize": false,
|
| 180 |
+
"foreach": null,
|
| 181 |
+
"capturable": false,
|
| 182 |
+
"differentiable": false,
|
| 183 |
+
"fused": null,
|
| 184 |
+
"decoupled_weight_decay": true,
|
| 185 |
+
"initial_lr": 0.001,
|
| 186 |
+
"params": [
|
| 187 |
+
17,
|
| 188 |
+
18,
|
| 189 |
+
19
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"lr": 0.0009755527298894294,
|
| 194 |
+
"name": "scale_1792",
|
| 195 |
+
"betas": [
|
| 196 |
+
0.9,
|
| 197 |
+
0.999
|
| 198 |
+
],
|
| 199 |
+
"eps": 1e-08,
|
| 200 |
+
"weight_decay": 1e-05,
|
| 201 |
+
"amsgrad": false,
|
| 202 |
+
"maximize": false,
|
| 203 |
+
"foreach": null,
|
| 204 |
+
"capturable": false,
|
| 205 |
+
"differentiable": false,
|
| 206 |
+
"fused": null,
|
| 207 |
+
"decoupled_weight_decay": true,
|
| 208 |
+
"initial_lr": 0.001,
|
| 209 |
+
"params": [
|
| 210 |
+
20,
|
| 211 |
+
21,
|
| 212 |
+
22
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"lr": 0.0009755527298894294,
|
| 217 |
+
"name": "scale_2048",
|
| 218 |
+
"betas": [
|
| 219 |
+
0.9,
|
| 220 |
+
0.999
|
| 221 |
+
],
|
| 222 |
+
"eps": 1e-08,
|
| 223 |
+
"weight_decay": 1e-05,
|
| 224 |
+
"amsgrad": false,
|
| 225 |
+
"maximize": false,
|
| 226 |
+
"foreach": null,
|
| 227 |
+
"capturable": false,
|
| 228 |
+
"differentiable": false,
|
| 229 |
+
"fused": null,
|
| 230 |
+
"decoupled_weight_decay": true,
|
| 231 |
+
"initial_lr": 0.001,
|
| 232 |
+
"params": [
|
| 233 |
+
23,
|
| 234 |
+
24,
|
| 235 |
+
25
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"lr": 0.0009755527298894294,
|
| 240 |
+
"name": "scale_2304",
|
| 241 |
+
"betas": [
|
| 242 |
+
0.9,
|
| 243 |
+
0.999
|
| 244 |
+
],
|
| 245 |
+
"eps": 1e-08,
|
| 246 |
+
"weight_decay": 1e-05,
|
| 247 |
+
"amsgrad": false,
|
| 248 |
+
"maximize": false,
|
| 249 |
+
"foreach": null,
|
| 250 |
+
"capturable": false,
|
| 251 |
+
"differentiable": false,
|
| 252 |
+
"fused": null,
|
| 253 |
+
"decoupled_weight_decay": true,
|
| 254 |
+
"initial_lr": 0.001,
|
| 255 |
+
"params": [
|
| 256 |
+
26,
|
| 257 |
+
27,
|
| 258 |
+
28
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"lr": 0.0009755527298894294,
|
| 263 |
+
"name": "scale_2560",
|
| 264 |
+
"betas": [
|
| 265 |
+
0.9,
|
| 266 |
+
0.999
|
| 267 |
+
],
|
| 268 |
+
"eps": 1e-08,
|
| 269 |
+
"weight_decay": 1e-05,
|
| 270 |
+
"amsgrad": false,
|
| 271 |
+
"maximize": false,
|
| 272 |
+
"foreach": null,
|
| 273 |
+
"capturable": false,
|
| 274 |
+
"differentiable": false,
|
| 275 |
+
"fused": null,
|
| 276 |
+
"decoupled_weight_decay": true,
|
| 277 |
+
"initial_lr": 0.001,
|
| 278 |
+
"params": [
|
| 279 |
+
29,
|
| 280 |
+
30,
|
| 281 |
+
31
|
| 282 |
+
]
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"lr": 0.00048778860081564085,
|
| 286 |
"name": "fusion",
|
| 287 |
"betas": [
|
| 288 |
0.9,
|
|
|
|
| 297 |
"differentiable": false,
|
| 298 |
"fused": null,
|
| 299 |
"decoupled_weight_decay": true,
|
| 300 |
+
"initial_lr": 0.0005,
|
| 301 |
"params": [
|
| 302 |
+
32,
|
| 303 |
+
33,
|
| 304 |
+
34,
|
| 305 |
+
35,
|
| 306 |
+
36,
|
| 307 |
+
37,
|
| 308 |
+
38,
|
| 309 |
+
39,
|
| 310 |
+
40,
|
| 311 |
+
41,
|
| 312 |
+
42,
|
| 313 |
+
43,
|
| 314 |
+
44,
|
| 315 |
+
45,
|
| 316 |
+
46,
|
| 317 |
+
47,
|
| 318 |
+
48,
|
| 319 |
+
49,
|
| 320 |
+
50,
|
| 321 |
+
51,
|
| 322 |
+
52,
|
| 323 |
+
53,
|
| 324 |
+
54,
|
| 325 |
+
55,
|
| 326 |
+
56,
|
| 327 |
+
57,
|
| 328 |
+
58,
|
| 329 |
+
59,
|
| 330 |
+
60,
|
| 331 |
+
61
|
| 332 |
]
|
| 333 |
}
|
| 334 |
]
|
| 335 |
},
|
| 336 |
"scheduler_state_dict": {
|
| 337 |
"T_0": 10,
|
| 338 |
+
"T_i": 10,
|
| 339 |
"T_mult": 2,
|
| 340 |
"eta_min": 1e-06,
|
| 341 |
+
"T_cur": 1,
|
| 342 |
"base_lrs": [
|
| 343 |
+
0.001,
|
| 344 |
+
0.001,
|
| 345 |
+
0.001,
|
| 346 |
+
0.001,
|
| 347 |
+
0.001,
|
| 348 |
+
0.001,
|
| 349 |
+
0.001,
|
| 350 |
+
0.001,
|
| 351 |
+
0.001,
|
| 352 |
+
0.001,
|
| 353 |
+
0.001,
|
| 354 |
+
0.0005
|
| 355 |
],
|
| 356 |
+
"last_epoch": 1,
|
| 357 |
"_step_count": 0,
|
| 358 |
"_is_initial": false,
|
| 359 |
"_get_lr_called_within_step": false,
|
| 360 |
"_last_lr": [
|
| 361 |
+
0.0009755527298894294,
|
| 362 |
+
0.0009755527298894294,
|
| 363 |
+
0.0009755527298894294,
|
| 364 |
+
0.0009755527298894294,
|
| 365 |
+
0.0009755527298894294,
|
| 366 |
+
0.0009755527298894294,
|
| 367 |
+
0.0009755527298894294,
|
| 368 |
+
0.0009755527298894294,
|
| 369 |
+
0.0009755527298894294,
|
| 370 |
+
0.0009755527298894294,
|
| 371 |
+
0.0009755527298894294,
|
| 372 |
+
0.00048778860081564085
|
| 373 |
]
|
| 374 |
},
|
| 375 |
"metrics": {
|
| 376 |
+
"best_val_acc": 80.786,
|
| 377 |
+
"best_epoch": 0,
|
| 378 |
"scale_accuracies": {
|
| 379 |
+
"256": 80.786
|
|
|
|
| 380 |
}
|
| 381 |
},
|
| 382 |
"train_config": {
|
| 383 |
"name": "david_training",
|
| 384 |
"run_id": "20251012_060013",
|
| 385 |
"dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
|
| 386 |
+
"model_variant": "clip_vit_l14",
|
| 387 |
"num_classes": 1000,
|
| 388 |
+
"preset": "clip_vit_l14_deep",
|
| 389 |
"custom_config_path": null,
|
| 390 |
"num_classes_override": null,
|
| 391 |
"use_belly_override": null,
|
| 392 |
"belly_expand_override": null,
|
| 393 |
+
"progressive_training_override": true,
|
| 394 |
+
"scale_warmup_epochs_override": {
|
| 395 |
+
"256": 0,
|
| 396 |
+
"512": 1,
|
| 397 |
+
"768": 2,
|
| 398 |
+
"1024": 3,
|
| 399 |
+
"1280": 4,
|
| 400 |
+
"1536": 5,
|
| 401 |
+
"1792": 6,
|
| 402 |
+
"2048": 7,
|
| 403 |
+
"2304": 8,
|
| 404 |
+
"2560": 9
|
| 405 |
+
},
|
| 406 |
"num_epochs": 10,
|
| 407 |
"batch_size": 1024,
|
| 408 |
+
"learning_rate": 0.001,
|
| 409 |
"weight_decay": 1e-05,
|
| 410 |
"warmup_epochs": 0,
|
| 411 |
"use_rose_loss": true,
|
weights/david_config.json
CHANGED
|
@@ -1,29 +1,45 @@
|
|
| 1 |
{
|
| 2 |
-
"name": "
|
| 3 |
-
"uid": "c.david.
|
| 4 |
-
"feature_dim":
|
| 5 |
"num_classes": 1000,
|
| 6 |
"scales": [
|
| 7 |
256,
|
| 8 |
-
512
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
],
|
| 10 |
-
"sharing_mode": "
|
| 11 |
-
"fusion_mode": "
|
| 12 |
-
"use_belly":
|
| 13 |
"belly_expand": 2.0,
|
| 14 |
-
"shared_feature_dim":
|
| 15 |
-
"shared_layers":
|
| 16 |
"shared_dropout": 0.1,
|
| 17 |
"fusion_temperature": 1.0,
|
| 18 |
"fusion_dropout": 0.1,
|
| 19 |
"tree_depth": 3,
|
| 20 |
-
"num_experts":
|
| 21 |
"compression_ratio": 4,
|
| 22 |
"expert_dropout": 0.1,
|
| 23 |
"attention_dropout": 0.1,
|
| 24 |
-
"progressive_training":
|
| 25 |
"scale_warmup_epochs": {
|
| 26 |
"256": 0,
|
| 27 |
-
"512":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"name": "david_clip_vit_l14_deep",
|
| 3 |
+
"uid": "c.david.clip_vit_l14_deep",
|
| 4 |
+
"feature_dim": 768,
|
| 5 |
"num_classes": 1000,
|
| 6 |
"scales": [
|
| 7 |
256,
|
| 8 |
+
512,
|
| 9 |
+
768,
|
| 10 |
+
1024,
|
| 11 |
+
1280,
|
| 12 |
+
1536,
|
| 13 |
+
1792,
|
| 14 |
+
2048,
|
| 15 |
+
2304,
|
| 16 |
+
2560
|
| 17 |
],
|
| 18 |
+
"sharing_mode": "partial_shared",
|
| 19 |
+
"fusion_mode": "deep_efficiency",
|
| 20 |
+
"use_belly": true,
|
| 21 |
"belly_expand": 2.0,
|
| 22 |
+
"shared_feature_dim": 1024,
|
| 23 |
+
"shared_layers": 4,
|
| 24 |
"shared_dropout": 0.1,
|
| 25 |
"fusion_temperature": 1.0,
|
| 26 |
"fusion_dropout": 0.1,
|
| 27 |
"tree_depth": 3,
|
| 28 |
+
"num_experts": 4,
|
| 29 |
"compression_ratio": 4,
|
| 30 |
"expert_dropout": 0.1,
|
| 31 |
"attention_dropout": 0.1,
|
| 32 |
+
"progressive_training": true,
|
| 33 |
"scale_warmup_epochs": {
|
| 34 |
"256": 0,
|
| 35 |
+
"512": 1,
|
| 36 |
+
"768": 2,
|
| 37 |
+
"1024": 3,
|
| 38 |
+
"1280": 4,
|
| 39 |
+
"1536": 5,
|
| 40 |
+
"1792": 6,
|
| 41 |
+
"2048": 7,
|
| 42 |
+
"2304": 8,
|
| 43 |
+
"2560": 9
|
| 44 |
}
|
| 45 |
}
|
weights/train_config.json
CHANGED
|
@@ -2,18 +2,29 @@
|
|
| 2 |
"name": "david_training",
|
| 3 |
"run_id": "20251012_060013",
|
| 4 |
"dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
|
| 5 |
-
"model_variant": "
|
| 6 |
"num_classes": 1000,
|
| 7 |
-
"preset": "
|
| 8 |
"custom_config_path": null,
|
| 9 |
"num_classes_override": null,
|
| 10 |
"use_belly_override": null,
|
| 11 |
"belly_expand_override": null,
|
| 12 |
-
"progressive_training_override":
|
| 13 |
-
"scale_warmup_epochs_override":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"num_epochs": 10,
|
| 15 |
"batch_size": 1024,
|
| 16 |
-
"learning_rate": 0.
|
| 17 |
"weight_decay": 1e-05,
|
| 18 |
"warmup_epochs": 0,
|
| 19 |
"use_rose_loss": true,
|
|
|
|
| 2 |
"name": "david_training",
|
| 3 |
"run_id": "20251012_060013",
|
| 4 |
"dataset_name": "AbstractPhil/imagenet-clip-features-orderly",
|
| 5 |
+
"model_variant": "clip_vit_l14",
|
| 6 |
"num_classes": 1000,
|
| 7 |
+
"preset": "clip_vit_l14_deep",
|
| 8 |
"custom_config_path": null,
|
| 9 |
"num_classes_override": null,
|
| 10 |
"use_belly_override": null,
|
| 11 |
"belly_expand_override": null,
|
| 12 |
+
"progressive_training_override": true,
|
| 13 |
+
"scale_warmup_epochs_override": {
|
| 14 |
+
"256": 0,
|
| 15 |
+
"512": 1,
|
| 16 |
+
"768": 2,
|
| 17 |
+
"1024": 3,
|
| 18 |
+
"1280": 4,
|
| 19 |
+
"1536": 5,
|
| 20 |
+
"1792": 6,
|
| 21 |
+
"2048": 7,
|
| 22 |
+
"2304": 8,
|
| 23 |
+
"2560": 9
|
| 24 |
+
},
|
| 25 |
"num_epochs": 10,
|
| 26 |
"batch_size": 1024,
|
| 27 |
+
"learning_rate": 0.001,
|
| 28 |
"weight_decay": 1e-05,
|
| 29 |
"warmup_epochs": 0,
|
| 30 |
"use_rose_loss": true,
|