Update models for DSGDm-8-complete
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +24 -0
- DSGDm-8-complete/3396810/checkpoints/model_45.pt +3 -0
- DSGDm-8-complete/3396810/checkpoints/model_90.pt +3 -0
- DSGDm-8-complete/3396810/data_cfg.dump.toml +14 -0
- DSGDm-8-complete/3396810/data_cfg.toml +16 -0
- DSGDm-8-complete/3396810/err.out +160 -0
- DSGDm-8-complete/3396810/log.out +0 -0
- DSGDm-8-complete/3396810/tb_trace/worker_00.1736803679700367017.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_01.1736803679694837989.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_02.1736803679700306866.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_03.1736803679701051019.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_04.1736803679699263562.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_05.1736803679694488849.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_06.1736803679698583528.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/tb_trace/worker_07.1736803679698570448.pt.trace.json +3 -0
- DSGDm-8-complete/3396810/test_results.csv +3 -0
- DSGDm-8-complete/3396810/train_cfg.dump.toml +50 -0
- DSGDm-8-complete/3396810/train_cfg.toml +36 -0
- DSGDm-8-complete/3396810/train_log.csv +91 -0
- DSGDm-8-complete/3396814/checkpoints/model_45.pt +3 -0
- DSGDm-8-complete/3396814/checkpoints/model_90.pt +3 -0
- DSGDm-8-complete/3396814/data_cfg.dump.toml +14 -0
- DSGDm-8-complete/3396814/data_cfg.toml +16 -0
- DSGDm-8-complete/3396814/err.out +218 -0
- DSGDm-8-complete/3396814/log.out +0 -0
- DSGDm-8-complete/3396814/tb_trace/worker_00.1736803741716515488.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_01.1736803741715256737.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_02.1736803741714535034.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_03.1736803741708035584.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_04.1736803741713051100.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_05.1736803741713040608.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_06.1736803741713039723.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/tb_trace/worker_07.1736803741713054925.pt.trace.json +3 -0
- DSGDm-8-complete/3396814/test_results.csv +3 -0
- DSGDm-8-complete/3396814/train_cfg.dump.toml +50 -0
- DSGDm-8-complete/3396814/train_cfg.toml +36 -0
- DSGDm-8-complete/3396814/train_log.csv +91 -0
- DSGDm-8-complete/3396815/checkpoints/model_45.pt +3 -0
- DSGDm-8-complete/3396815/checkpoints/model_90.pt +3 -0
- DSGDm-8-complete/3396815/data_cfg.dump.toml +14 -0
- DSGDm-8-complete/3396815/data_cfg.toml +16 -0
- DSGDm-8-complete/3396815/err.out +161 -0
- DSGDm-8-complete/3396815/log.out +0 -0
- DSGDm-8-complete/3396815/tb_trace/worker_00.1736803740196062136.pt.trace.json +3 -0
- DSGDm-8-complete/3396815/tb_trace/worker_01.1736803740195517240.pt.trace.json +3 -0
- DSGDm-8-complete/3396815/tb_trace/worker_02.1736803740198629006.pt.trace.json +3 -0
- DSGDm-8-complete/3396815/tb_trace/worker_03.1736803740196019080.pt.trace.json +3 -0
- DSGDm-8-complete/3396815/tb_trace/worker_04.1736803740196645296.pt.trace.json +3 -0
- DSGDm-8-complete/3396815/tb_trace/worker_05.1736803740196646399.pt.trace.json +3 -0
- DSGDm-8-complete/3396815/tb_trace/worker_06.1736803740196720663.pt.trace.json +3 -0
.gitattributes
CHANGED
|
@@ -69,3 +69,27 @@ DSGDm-8-ring/3396800/tb_trace/worker_04.1736802649422631598.pt.trace.json filter
|
|
| 69 |
DSGDm-8-ring/3396800/tb_trace/worker_05.1736802649420739266.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 70 |
DSGDm-8-ring/3396800/tb_trace/worker_06.1736802649437161614.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 71 |
DSGDm-8-ring/3396800/tb_trace/worker_07.1736802649437846610.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
DSGDm-8-ring/3396800/tb_trace/worker_05.1736802649420739266.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 70 |
DSGDm-8-ring/3396800/tb_trace/worker_06.1736802649437161614.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 71 |
DSGDm-8-ring/3396800/tb_trace/worker_07.1736802649437846610.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
DSGDm-8-complete/3396810/tb_trace/worker_00.1736803679700367017.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
DSGDm-8-complete/3396810/tb_trace/worker_01.1736803679694837989.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
DSGDm-8-complete/3396810/tb_trace/worker_02.1736803679700306866.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
DSGDm-8-complete/3396810/tb_trace/worker_03.1736803679701051019.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
DSGDm-8-complete/3396810/tb_trace/worker_04.1736803679699263562.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
DSGDm-8-complete/3396810/tb_trace/worker_05.1736803679694488849.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
DSGDm-8-complete/3396810/tb_trace/worker_06.1736803679698583528.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
DSGDm-8-complete/3396810/tb_trace/worker_07.1736803679698570448.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
DSGDm-8-complete/3396814/tb_trace/worker_00.1736803741716515488.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
DSGDm-8-complete/3396814/tb_trace/worker_01.1736803741715256737.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
DSGDm-8-complete/3396814/tb_trace/worker_02.1736803741714535034.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
DSGDm-8-complete/3396814/tb_trace/worker_03.1736803741708035584.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
DSGDm-8-complete/3396814/tb_trace/worker_04.1736803741713051100.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
DSGDm-8-complete/3396814/tb_trace/worker_05.1736803741713040608.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
DSGDm-8-complete/3396814/tb_trace/worker_06.1736803741713039723.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
DSGDm-8-complete/3396814/tb_trace/worker_07.1736803741713054925.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
DSGDm-8-complete/3396815/tb_trace/worker_00.1736803740196062136.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
DSGDm-8-complete/3396815/tb_trace/worker_01.1736803740195517240.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
DSGDm-8-complete/3396815/tb_trace/worker_02.1736803740198629006.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
DSGDm-8-complete/3396815/tb_trace/worker_03.1736803740196019080.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
DSGDm-8-complete/3396815/tb_trace/worker_04.1736803740196645296.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
DSGDm-8-complete/3396815/tb_trace/worker_05.1736803740196646399.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
DSGDm-8-complete/3396815/tb_trace/worker_06.1736803740196720663.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
DSGDm-8-complete/3396815/tb_trace/worker_07.1736803740196718891.pt.trace.json filter=lfs diff=lfs merge=lfs -text
|
DSGDm-8-complete/3396810/checkpoints/model_45.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95bcc99e642ccb43936d3df59ad3a9bbf7bf6e077135484f1aeddb493423fc9d
|
| 3 |
+
size 102518166
|
DSGDm-8-complete/3396810/checkpoints/model_90.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98b5a3006e6f3d99def1f35e3e9e873c55bc4f69df3cebe886cdf83d63ce02c1
|
| 3 |
+
size 102518166
|
DSGDm-8-complete/3396810/data_cfg.dump.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_dir = "./data/Imagenet"
|
| 2 |
+
num_classes = 1000
|
| 3 |
+
|
| 4 |
+
[dataloader]
|
| 5 |
+
name = "ffcv"
|
| 6 |
+
processed_data_dir = "./data/ffcv"
|
| 7 |
+
max_resolution = 500
|
| 8 |
+
compress_probability = 1.0
|
| 9 |
+
jpeg_quality = 90
|
| 10 |
+
num_data_workers = 12
|
| 11 |
+
in_memory = true
|
| 12 |
+
tag = "ffcv_500_1.000_90"
|
| 13 |
+
train_data_dir = "./data/ffcv/ffcv_500_1.000_90_train.ffcv"
|
| 14 |
+
val_data_dir = "./data/ffcv/ffcv_500_1.000_90_val.ffcv"
|
DSGDm-8-complete/3396810/data_cfg.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_dir = "./data/Imagenet"
|
| 2 |
+
num_classes = 1000
|
| 3 |
+
|
| 4 |
+
[dataloader]
|
| 5 |
+
name = "ffcv"
|
| 6 |
+
processed_data_dir = "./data/ffcv"
|
| 7 |
+
max_resolution = 500
|
| 8 |
+
compress_probability = 1.0
|
| 9 |
+
jpeg_quality = 90
|
| 10 |
+
num_data_workers = 12
|
| 11 |
+
|
| 12 |
+
# [dataloader]
|
| 13 |
+
# name = "dali"
|
| 14 |
+
# preload = true
|
| 15 |
+
# sharded_data_dir = "./data/Imagenet-sharded"
|
| 16 |
+
# num_data_workers = 8
|
DSGDm-8-complete/3396810/err.out
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793]
|
| 2 |
+
W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793] *****************************************
|
| 3 |
+
W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793] *****************************************
|
| 5 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
|
| 6 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
|
| 7 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
|
| 8 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
|
| 9 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
|
| 10 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] run_id : 30041
|
| 11 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
|
| 12 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.161:28052
|
| 13 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
|
| 14 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
|
| 15 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
|
| 16 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396810/torchelastic_brzz4zhi
|
| 17 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
|
| 18 |
+
I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194]
|
| 19 |
+
I0113 22:26:26.357000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
|
| 20 |
+
I0113 22:26:26.358000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
|
| 21 |
+
W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793]
|
| 22 |
+
W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793] *****************************************
|
| 23 |
+
W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 24 |
+
W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793] *****************************************
|
| 25 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
|
| 26 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
|
| 27 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
|
| 28 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
|
| 29 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
|
| 30 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] run_id : 30041
|
| 31 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
|
| 32 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.161:28052
|
| 33 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
|
| 34 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
|
| 35 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
|
| 36 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396810/torchelastic_1cs_tjn0
|
| 37 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
|
| 38 |
+
I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194]
|
| 39 |
+
I0113 22:26:27.643000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
|
| 40 |
+
I0113 22:26:27.644000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
|
| 41 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
|
| 42 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
|
| 43 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.161
|
| 44 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
|
| 45 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=0
|
| 46 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
|
| 47 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
|
| 48 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3]
|
| 49 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3]
|
| 50 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
|
| 51 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
|
| 52 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525]
|
| 53 |
+
I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
|
| 54 |
+
I0113 22:26:28.628000 4536 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
|
| 55 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
|
| 56 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
|
| 57 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.161
|
| 58 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
|
| 59 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=1
|
| 60 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
|
| 61 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
|
| 62 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[4, 5, 6, 7]
|
| 63 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[4, 5, 6, 7]
|
| 64 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
|
| 65 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
|
| 66 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525]
|
| 67 |
+
I0113 22:26:28.628000 4536 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
|
| 68 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
|
| 69 |
+
I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
|
| 70 |
+
I0113 22:26:28.629000 4536 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
|
| 71 |
+
I0113 22:26:28.629000 4457 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
|
| 72 |
+
I0113 22:26:28.629000 4457 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
|
| 73 |
+
[rank1]:[W113 22:26:44.646740965 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 74 |
+
[rank3]:[W113 22:26:44.646748763 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 75 |
+
[rank0]:[W113 22:26:44.646756097 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 76 |
+
[rank2]:[W113 22:26:44.646843940 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 77 |
+
[rank4]:[W113 22:26:46.899645566 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 78 |
+
[rank7]:[W113 22:26:46.900202635 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 79 |
+
[rank6]:[W113 22:26:46.900424479 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 80 |
+
[rank5]:[W113 22:26:46.900872526 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 81 |
+
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
|
| 82 |
+
wandb: Currently logged in as: zesen. Use `wandb login --relogin` to force relogin
|
| 83 |
+
wandb: Tracking run with wandb version 0.19.1
|
| 84 |
+
wandb: Run data is saved locally in /local/tmp.3396810/wandb/run-20250113_222653-b5c2ab3l
|
| 85 |
+
wandb: Run `wandb offline` to turn off syncing.
|
| 86 |
+
wandb: Syncing run 3396810
|
| 87 |
+
wandb: ⭐️ View project at https://wandb.ai/zesen/decent-sam
|
| 88 |
+
wandb: 🚀 View run at https://wandb.ai/zesen/decent-sam/runs/b5c2ab3l
|
| 89 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 90 |
+
warnings.warn(
|
| 91 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 92 |
+
warnings.warn(
|
| 93 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 94 |
+
warnings.warn(
|
| 95 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 96 |
+
warnings.warn(
|
| 97 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 98 |
+
warnings.warn(
|
| 99 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 100 |
+
warnings.warn(
|
| 101 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 102 |
+
warnings.warn(
|
| 103 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 104 |
+
warnings.warn(
|
| 105 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 106 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 107 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 108 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 109 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 110 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 111 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 112 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 113 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 114 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 115 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 116 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 117 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 118 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 119 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 120 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 121 |
+
wandb:
|
| 122 |
+
[rank3]:[W114 00:52:45.077889515 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 123 |
+
wandb:
|
| 124 |
+
wandb: Run history:
|
| 125 |
+
wandb: epoch βββββββββββββββββββββ
β
β
β
β
β
ββββββββββββββ
|
| 126 |
+
wandb: epoch_train_time ββββββββββββββββββββββββββββββββββββββββ
|
| 127 |
+
wandb: loss ββ
β
βββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
wandb: lr βββββ
ββββββββββββββββββ
β
β
βββββββββββββββ
|
| 129 |
+
wandb: total_train_time ββββββββββββββββββββββ
β
β
β
β
β
β
β
β
ββββββββββ
|
| 130 |
+
wandb: val_acc1 ββββ
β
β
ββββββββββββββββββββββββββββββββββ
|
| 131 |
+
wandb: val_acc5 ββββββββββββββββββββββββββββββββββββββββ
|
| 132 |
+
wandb: val_loss βββ
β
ββββββββββββββββββββββββββββββββββββ
|
| 133 |
+
wandb:
|
| 134 |
+
wandb: Run summary:
|
| 135 |
+
wandb: epoch 90
|
| 136 |
+
wandb: epoch_train_time 92.1866
|
| 137 |
+
wandb: loss 1.75573
|
| 138 |
+
wandb: lr 1e-05
|
| 139 |
+
wandb: total_train_time 8481.97145
|
| 140 |
+
wandb: val_acc1 77.52
|
| 141 |
+
wandb: val_acc5 93.644
|
| 142 |
+
wandb: val_loss 1.95997
|
| 143 |
+
wandb:
|
| 144 |
+
[rank6]:[W114 00:52:46.212017408 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 145 |
+
[rank1]:[W114 00:52:46.299757777 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 146 |
+
[rank7]:[W114 00:52:46.318365775 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 147 |
+
[rank2]:[W114 00:52:46.362530996 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 148 |
+
[rank5]:[W114 00:52:46.355965645 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 149 |
+
[rank4]:[W114 00:52:46.356968285 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 150 |
+
wandb: π View run 3396810 at: https://wandb.ai/zesen/decent-sam/runs/b5c2ab3l
|
| 151 |
+
wandb: βοΈ View project at: https://wandb.ai/zesen/decent-sam
|
| 152 |
+
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
|
| 153 |
+
wandb: Find logs at: /local/tmp.3396810/wandb/run-20250113_222653-b5c2ab3l/logs
|
| 154 |
+
[rank0]:[W114 00:52:49.322995684 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 155 |
+
I0114 00:52:56.487000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
|
| 156 |
+
I0114 00:52:56.489000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
|
| 157 |
+
I0114 00:53:01.055000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
|
| 158 |
+
I0114 00:53:01.059000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
|
| 159 |
+
I0114 00:53:01.060000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0008060932159423828 seconds
|
| 160 |
+
I0114 00:53:01.061000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 4.571002244949341 seconds
|
DSGDm-8-complete/3396810/log.out
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
DSGDm-8-complete/3396810/tb_trace/worker_00.1736803679700367017.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28868768a79899aeb44d84454a68bb24e4aa65dd94086a1b4a3079bb0ec0973a
|
| 3 |
+
size 15106886
|
DSGDm-8-complete/3396810/tb_trace/worker_01.1736803679694837989.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a16ed4e388c73efea1694f3533f2598cb83ccb4078b75ea2454acadf864cc88
|
| 3 |
+
size 15018765
|
DSGDm-8-complete/3396810/tb_trace/worker_02.1736803679700306866.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b0f1778fc24a2e2101452d9bec39462dbd3ece9206cfd28c75e2b6f29ae7647
|
| 3 |
+
size 15068964
|
DSGDm-8-complete/3396810/tb_trace/worker_03.1736803679701051019.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fb16e2208b3dc4473ecc9ae2820092a6be5ae09ec98c603fb4552b2f4be301f
|
| 3 |
+
size 14998443
|
DSGDm-8-complete/3396810/tb_trace/worker_04.1736803679699263562.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28a2aacea07d17081b43b5d9672959b9ad1592ad99a73ace4b2faa3b098c905e
|
| 3 |
+
size 15038253
|
DSGDm-8-complete/3396810/tb_trace/worker_05.1736803679694488849.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52d25b7ed8450330e76ce6f7fc8c4724de57cd25cc1e39c98840bacd1ad13d5e
|
| 3 |
+
size 15159249
|
DSGDm-8-complete/3396810/tb_trace/worker_06.1736803679698583528.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f9897774b01c14a46f5c95e794793d26ac33c77737753f882c5a6d1474110cb
|
| 3 |
+
size 15033494
|
DSGDm-8-complete/3396810/tb_trace/worker_07.1736803679698570448.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:449570c66b5d288c3e8742b63837abdbd235376466ccb335d9478c6153f8ed85
|
| 3 |
+
size 15086969
|
DSGDm-8-complete/3396810/test_results.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
epoch,val_loss,val_acc1,val_acc5,val_samples
|
| 2 |
+
45.0,2.2337640564727783,68.86999997314453,89.11200002441406,50000.0
|
| 3 |
+
90.0,1.9381773719406128,77.48400000976562,93.65199993896485,50000.0
|
DSGDm-8-complete/3396810/train_cfg.dump.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size = 1024
|
| 2 |
+
max_epochs = 90
|
| 3 |
+
lr = 1.0
|
| 4 |
+
label_smoothing = 0.1
|
| 5 |
+
grad_clip_norm = 0.0
|
| 6 |
+
checkpoint_dir = ""
|
| 7 |
+
arch = "resnet50"
|
| 8 |
+
use_amp = true
|
| 9 |
+
num_samples_for_stats = 102400
|
| 10 |
+
batch_size_per_local_batch = 128
|
| 11 |
+
|
| 12 |
+
[backend]
|
| 13 |
+
name = "decent-dp"
|
| 14 |
+
topology = "complete"
|
| 15 |
+
|
| 16 |
+
[preprocess]
|
| 17 |
+
preload_local = true
|
| 18 |
+
interpolation = "bilinear"
|
| 19 |
+
train_crop_size = 176
|
| 20 |
+
val_image_size = 256
|
| 21 |
+
val_crop_size = 224
|
| 22 |
+
|
| 23 |
+
[optim]
|
| 24 |
+
name = "sgd"
|
| 25 |
+
weight_decay = 3.0517578125e-05
|
| 26 |
+
momentum = 0.875
|
| 27 |
+
|
| 28 |
+
[lr_scheduler]
|
| 29 |
+
name = "cosine"
|
| 30 |
+
warmup_epochs = 5
|
| 31 |
+
warmup_decay = 0.01
|
| 32 |
+
eta_min = 1e-05
|
| 33 |
+
|
| 34 |
+
[reproduce]
|
| 35 |
+
seed = 810976
|
| 36 |
+
|
| 37 |
+
[log]
|
| 38 |
+
log_freq = 100
|
| 39 |
+
wandb_on = true
|
| 40 |
+
wandb_project = "decent-sam"
|
| 41 |
+
checkpoint_freq = 45
|
| 42 |
+
job_id = "3396810"
|
| 43 |
+
log_dir = "/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396810"
|
| 44 |
+
|
| 45 |
+
[network]
|
| 46 |
+
world_size = 8
|
| 47 |
+
rank = 0
|
| 48 |
+
local_rank = 0
|
| 49 |
+
local_world_size = 4
|
| 50 |
+
node_list = "alvis7-[09-10]"
|
DSGDm-8-complete/3396810/train_cfg.toml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size = 1024
|
| 2 |
+
max_epochs = 90
|
| 3 |
+
lr = 1.0
|
| 4 |
+
label_smoothing = 0.1
|
| 5 |
+
arch = "resnet50"
|
| 6 |
+
use_amp = true
|
| 7 |
+
|
| 8 |
+
[backend]
|
| 9 |
+
name = 'decent-dp'
|
| 10 |
+
topology = 'complete'
|
| 11 |
+
|
| 12 |
+
[preprocess]
|
| 13 |
+
preload_local = true
|
| 14 |
+
interpolation = "bilinear"
|
| 15 |
+
train_crop_size = 176
|
| 16 |
+
val_image_size = 256
|
| 17 |
+
val_crop_size = 224
|
| 18 |
+
|
| 19 |
+
[optim]
|
| 20 |
+
name = 'sgd'
|
| 21 |
+
momentum = 0.875
|
| 22 |
+
weight_decay = 0.000030517578125
|
| 23 |
+
|
| 24 |
+
[lr_scheduler]
|
| 25 |
+
name = 'cosine'
|
| 26 |
+
warmup_epochs = 5
|
| 27 |
+
warmup_decay = 0.01
|
| 28 |
+
|
| 29 |
+
[reproduce]
|
| 30 |
+
seed = 810976
|
| 31 |
+
|
| 32 |
+
[log]
|
| 33 |
+
log_freq = 100
|
| 34 |
+
wandb_on = true
|
| 35 |
+
wandb_project = "decent-sam"
|
| 36 |
+
checkpoint_freq = 45
|
DSGDm-8-complete/3396810/train_log.csv
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
epoch,step,train_loss,val_loss,val_acc1,val_acc5,time,checkpoint_dir
|
| 2 |
+
1,1251,6.115497382234135,5.49630715549469,8.321999994430541,22.581999992523194,178.20069289207458,
|
| 3 |
+
2,2502,4.902603116562422,4.38631671503067,23.0619999798584,46.49800000289917,272.038290977478,
|
| 4 |
+
3,3753,4.195299116017626,3.9677681756305696,33.50599998046875,59.47599997314453,364.5414605140686,
|
| 5 |
+
4,5004,3.8045895575857656,3.4839741384124756,40.48199998336792,66.93999997497559,456.93470644950867,
|
| 6 |
+
5,6255,3.587492471595081,3.3035725147724153,45.679999972076416,71.42799997711181,549.1559157371521,
|
| 7 |
+
6,7506,3.4111864673052663,3.252914929189682,46.39999997375488,72.17200000518798,641.4471440315247,
|
| 8 |
+
7,8757,3.2772125174387465,3.0007691029167174,51.16199997344971,76.56999998840332,733.5894980430603,
|
| 9 |
+
8,10008,3.1858509929536534,2.944188568649292,52.62199997467041,77.4099999874878,825.8874089717865,
|
| 10 |
+
9,11259,3.114799227526815,3.0860951818275453,49.53599997512817,74.72599998565674,918.1725902557373,
|
| 11 |
+
10,12510,3.0601791883353515,2.801905057754517,55.79999997833252,80.17799998168945,1010.4640154838562,
|
| 12 |
+
11,13761,3.0145340088031274,2.8203145016384124,55.521999955444336,80.04599997375489,1110.194946527481,
|
| 13 |
+
12,15012,2.9810065702854587,2.818964365930557,55.29199998260498,79.78199998138427,1202.1528532505035,
|
| 14 |
+
13,16263,2.9469908509466,2.7131245301103593,57.641999983825684,81.55200000152588,1294.0937585830688,
|
| 15 |
+
14,17514,2.9168076822750097,2.696461324515343,58.14199997253418,81.8919999798584,1393.63254904747,
|
| 16 |
+
15,18765,2.8954941391659013,2.7241055217075347,57.52800000976563,81.48799998718262,1485.6434042453766,
|
| 17 |
+
16,20016,2.872533181111971,2.6555525961971282,59.35599998443603,82.98799996643066,1577.6670188903809,
|
| 18 |
+
17,21267,2.8503478197790355,2.698758298025131,58.60799997528076,82.30599999786376,1677.0858178138733,
|
| 19 |
+
18,22518,2.832455455828056,2.673398114233017,58.705999983215335,82.3939999822998,1768.8874039649963,
|
| 20 |
+
19,23769,2.816942418341061,2.596034429960251,60.95199998413086,84.24199998535157,1860.7363867759705,
|
| 21 |
+
20,25020,2.7986710402938866,2.5837205569553374,61.309999973754884,84.33599998016358,1952.6566116809845,
|
| 22 |
+
21,26271,2.7846179530441426,2.7024088632774355,58.051999953308105,81.90399998657226,2044.8731739521027,
|
| 23 |
+
22,27522,2.769252215715335,2.611931260843277,60.715999960327146,83.91599997741699,2137.1224229335785,
|
| 24 |
+
23,28773,2.7585869756200427,2.5537972654533387,62.10599998229981,84.76799998840332,2229.266894340515,
|
| 25 |
+
24,30024,2.7435707499940905,2.557687119922638,61.66799997253418,84.59399999664306,2321.205994606018,
|
| 26 |
+
25,31275,2.7306130466272505,2.6683655838346483,59.56999998657226,82.65999997741699,2413.2169272899628,
|
| 27 |
+
26,32526,2.71709777485076,2.524520060300827,62.71799997314453,85.43799997253419,2505.2413029670715,
|
| 28 |
+
27,33777,2.7053595260321663,2.5170880539512632,62.23199997543335,84.88599996887207,2604.7841408252716,
|
| 29 |
+
28,35028,2.6954730913984024,2.5253023421144487,61.97999997650147,84.99799997222901,2696.6729650497437,
|
| 30 |
+
29,36279,2.6833848688575768,2.5719702689266204,61.5839999887085,84.5299999935913,2788.51065325737,
|
| 31 |
+
30,37530,2.670573327038214,2.5124143429517747,63.17399995239258,85.64599999938964,2887.899392604828,
|
| 32 |
+
31,38781,2.6598851650977116,2.511490090098381,63.63800001159668,85.55999996459961,2979.737328529358,
|
| 33 |
+
32,40032,2.646365909434432,2.539848774638176,62.865999978942874,85.07999997406006,3071.771167039871,
|
| 34 |
+
33,41283,2.63584532190284,2.4440039260196684,64.55799997772216,86.51999998657226,3171.4136126041412,
|
| 35 |
+
34,42534,2.6246873265166553,2.4617872117948534,64.07999997589111,86.01999999053955,3263.3756012916565,
|
| 36 |
+
35,43785,2.6102599474928265,2.495519027967453,64.0659999935913,86.00200000488282,3355.366861104965,
|
| 37 |
+
36,45036,2.600696023348138,2.4298770931959153,64.75199996795654,86.64999998535156,3447.217997074127,
|
| 38 |
+
37,46287,2.5888351397358065,2.4921805786609648,63.15599997375488,85.52399997772217,3539.190637111664,
|
| 39 |
+
38,47538,2.5782451668708064,2.465635354757309,64.35399996612549,86.52400001312256,3631.2022793293,
|
| 40 |
+
39,48789,2.566590094547287,2.3938324451351165,65.39399996826172,86.99999999664307,3723.1156182289124,
|
| 41 |
+
40,50040,2.5534790206632074,2.3881699456834795,65.55799998321534,86.99199998596191,3815.030925989151,
|
| 42 |
+
41,51291,2.540958863058441,2.377643189086914,66.41799998596191,87.78799998138427,3906.896488904953,
|
| 43 |
+
42,52542,2.52921495210829,2.370751012825966,66.04199999267578,87.65399995941162,3998.8248476982117,
|
| 44 |
+
43,53793,2.5176791096333977,2.447054692735672,65.20199995361328,86.69199997528077,4098.377296209335,
|
| 45 |
+
44,55044,2.5004122752985127,2.3770597540855407,66.16999997436524,87.63999998382569,4190.288501262665,
|
| 46 |
+
45,56295,2.4915684670996034,2.3904737115240096,65.55999998291016,86.99599998870849,4282.123118877411,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396810/checkpoints/model_45.pt
|
| 47 |
+
46,57546,2.4768516317927105,2.370375301232338,66.6199999911499,87.54799998901368,4381.528468608856,
|
| 48 |
+
47,58797,2.462672047906642,2.3097496503448487,67.64400000366211,88.34999995605469,4473.578360557556,
|
| 49 |
+
48,60048,2.450321657134951,2.291786767001152,68.1439999621582,88.5859999545288,4565.646231889725,
|
| 50 |
+
49,61299,2.4367442497913596,2.414863888645172,65.57199995544434,87.01399996490478,4665.047325372696,
|
| 51 |
+
50,62550,2.421720023456809,2.3640809498119353,66.9279999847412,87.52200000518799,4756.795320510864,
|
| 52 |
+
51,63801,2.409095025081619,2.337284747476578,67.53199996368409,88.26999998565674,4848.593809604645,
|
| 53 |
+
52,65052,2.39349576933302,2.3086278829574587,67.99199998260498,88.29399997283936,4940.544646501541,
|
| 54 |
+
53,66303,2.379139742929396,2.28750909260273,68.62399998016357,88.74199997589112,5032.606946706772,
|
| 55 |
+
54,67554,2.3633363546727657,2.331979500384331,67.50999998596191,88.1219999597168,5124.647661447525,
|
| 56 |
+
55,68805,2.3463351102136403,2.2553140204572677,69.30599996551514,89.40599997070312,5216.527634859085,
|
| 57 |
+
56,70056,2.331912900642049,2.2343211260557174,69.49999996795654,89.48799995758057,5308.56108379364,
|
| 58 |
+
57,71307,2.3144515739928044,2.233425653114319,69.98399998718261,89.68799997314453,5400.826423168182,
|
| 59 |
+
58,72558,2.298989494939407,2.267748852825165,69.11199995941162,89.16999995208741,5493.116752147675,
|
| 60 |
+
59,73809,2.2821754796041858,2.321332974061966,67.68799996368408,88.15399998382568,5592.708019256592,
|
| 61 |
+
60,75060,2.266402605745337,2.233510379371643,70.10800000549317,89.59599996002197,5684.661290645599,
|
| 62 |
+
61,76311,2.248882063477636,2.269366139588356,69.6219999621582,89.49599996795655,5776.508955955505,
|
| 63 |
+
62,77562,2.2304583841638506,2.1631349580574035,71.61999999816895,90.65399994934081,5875.927084207535,
|
| 64 |
+
63,78813,2.210439218022078,2.1846533494901657,71.30799998931884,90.24399996520997,5967.6671352386475,
|
| 65 |
+
64,80064,2.1924270626476154,2.20357965007782,70.41599999725342,89.83599999725342,6059.495954990387,
|
| 66 |
+
65,81315,2.1723172299796154,2.14432814950943,72.68599997406005,90.9159999710083,6158.839286565781,
|
| 67 |
+
66,82566,2.1546609271034822,2.136791229815483,72.47399998474121,91.01799998413085,6250.693339586258,
|
| 68 |
+
67,83817,2.1353299207657837,2.1841600111722945,71.4879999734497,90.48399996826171,6342.555477380753,
|
| 69 |
+
68,85068,2.113898727581274,2.1171528541707993,73.22599997680663,91.27599993896484,6434.511804103851,
|
| 70 |
+
69,86319,2.0941642940663794,2.08480306992054,73.80799997406005,91.69799995788574,6526.674050331116,
|
| 71 |
+
70,87570,2.0748744738473595,2.1041335141468047,73.37799997406006,91.47399996032715,6618.951231479645,
|
| 72 |
+
71,88821,2.05246179085746,2.097317939052582,73.63399998199463,91.635999944458,6711.154579639435,
|
| 73 |
+
72,90072,2.0299681932162894,2.114106169629097,73.26199998168946,91.47999996612549,6803.397469758987,
|
| 74 |
+
73,91323,2.0102840042371546,2.05411882917881,74.55999997680664,92.17199994445801,6895.66441822052,
|
| 75 |
+
74,92574,1.988244648460004,2.0506596167564393,74.5999999899292,92.165999949646,6987.938447475433,
|
| 76 |
+
75,93825,1.9637918177959348,2.046655902919769,74.71800000335693,92.22999994140625,7087.6016981601715,
|
| 77 |
+
76,95076,1.946619587610189,2.016001330356598,75.63199997650146,92.66799994415283,7179.433463811874,
|
| 78 |
+
77,96327,1.9239142328191052,1.9993775571107864,75.95200000396729,92.97799993591309,7271.2513654232025,
|
| 79 |
+
78,97578,1.9024203764639502,1.994896138973236,76.15399998260499,93.06599995788574,7370.6608374118805,
|
| 80 |
+
79,98829,1.8808876718048284,1.9910964307689667,76.37600000946045,93.05599993896485,7462.502587795258,
|
| 81 |
+
80,100080,1.8634167736430438,1.984393134250641,76.21399999053955,93.07199994445801,7554.465767860413,
|
| 82 |
+
81,101331,1.8463708090124655,1.9833151563167573,76.65400000427246,93.21199995513916,7654.044641971588,
|
| 83 |
+
82,102582,1.8278751979819494,1.9754268894052505,76.92800000457764,93.27399996063232,7745.87461400032,
|
| 84 |
+
83,103833,1.8130075675334862,1.9714242833566666,77.15600000457763,93.387999944458,7837.780626296997,
|
| 85 |
+
84,105084,1.7992366814761043,1.9707070908737183,77.13599999053955,93.515999944458,7929.781843185425,
|
| 86 |
+
85,106335,1.7906732545863333,1.9659925394535065,77.33200000976562,93.46599993621827,8021.87069940567,
|
| 87 |
+
86,107586,1.7811057284462462,1.9619173885440826,77.39200000152587,93.54999993896485,8113.900476932526,
|
| 88 |
+
87,108837,1.775649699673569,1.9636126394271851,77.32399999084473,93.56599993896485,8205.859763145447,
|
| 89 |
+
88,110088,1.7688615699585297,1.9585561914587022,77.37600000976562,93.6239999472046,8297.732298135757,
|
| 90 |
+
89,111339,1.766685530031137,1.9597701799440383,77.41200000701905,93.60799993896484,8389.784857749939,
|
| 91 |
+
90,112590,1.7652602999521008,1.9599693808031082,77.52000001251221,93.64399993896484,8481.97145485878,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396810/checkpoints/model_90.pt
|
DSGDm-8-complete/3396814/checkpoints/model_45.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbd5a664a9b970d7f93d79e7f0514c25952b4c71f4f0a88f55569c73664edfa9
|
| 3 |
+
size 102518166
|
DSGDm-8-complete/3396814/checkpoints/model_90.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f15cd9230781645d90afb246083c4ca22ce6b20cf9bebfb726ecd698c97acd60
|
| 3 |
+
size 102518166
|
DSGDm-8-complete/3396814/data_cfg.dump.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_dir = "./data/Imagenet"
|
| 2 |
+
num_classes = 1000
|
| 3 |
+
|
| 4 |
+
[dataloader]
|
| 5 |
+
name = "ffcv"
|
| 6 |
+
processed_data_dir = "./data/ffcv"
|
| 7 |
+
max_resolution = 500
|
| 8 |
+
compress_probability = 1.0
|
| 9 |
+
jpeg_quality = 90
|
| 10 |
+
num_data_workers = 12
|
| 11 |
+
in_memory = true
|
| 12 |
+
tag = "ffcv_500_1.000_90"
|
| 13 |
+
train_data_dir = "./data/ffcv/ffcv_500_1.000_90_train.ffcv"
|
| 14 |
+
val_data_dir = "./data/ffcv/ffcv_500_1.000_90_val.ffcv"
|
DSGDm-8-complete/3396814/data_cfg.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_dir = "./data/Imagenet"
|
| 2 |
+
num_classes = 1000
|
| 3 |
+
|
| 4 |
+
[dataloader]
|
| 5 |
+
name = "ffcv"
|
| 6 |
+
processed_data_dir = "./data/ffcv"
|
| 7 |
+
max_resolution = 500
|
| 8 |
+
compress_probability = 1.0
|
| 9 |
+
jpeg_quality = 90
|
| 10 |
+
num_data_workers = 12
|
| 11 |
+
|
| 12 |
+
# [dataloader]
|
| 13 |
+
# name = "dali"
|
| 14 |
+
# preload = true
|
| 15 |
+
# sharded_data_dir = "./data/Imagenet-sharded"
|
| 16 |
+
# num_data_workers = 8
|
DSGDm-8-complete/3396814/err.out
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793]
|
| 2 |
+
W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793] *****************************************
|
| 3 |
+
W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793] *****************************************
|
| 5 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
|
| 6 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
|
| 7 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
|
| 8 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
|
| 9 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
|
| 10 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] run_id : 26983
|
| 11 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
|
| 12 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.163:28052
|
| 13 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
|
| 14 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
|
| 15 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
|
| 16 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396814/torchelastic_xo1s6d9h
|
| 17 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
|
| 18 |
+
I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194]
|
| 19 |
+
W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793]
|
| 20 |
+
W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793] *****************************************
|
| 21 |
+
W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 22 |
+
W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793] *****************************************
|
| 23 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
|
| 24 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
|
| 25 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
|
| 26 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
|
| 27 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
|
| 28 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] run_id : 26983
|
| 29 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
|
| 30 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.163:28052
|
| 31 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
|
| 32 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
|
| 33 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
|
| 34 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396814/torchelastic__h0j47iu
|
| 35 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
|
| 36 |
+
I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194]
|
| 37 |
+
I0113 22:27:35.282000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
|
| 38 |
+
I0113 22:27:35.282000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
|
| 39 |
+
I0113 22:27:35.389000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
|
| 40 |
+
I0113 22:27:35.389000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
|
| 41 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
|
| 42 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
|
| 43 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.163
|
| 44 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
|
| 45 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=0
|
| 46 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
|
| 47 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
|
| 48 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3]
|
| 49 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3]
|
| 50 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
|
| 51 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
|
| 52 |
+
I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525]
|
| 53 |
+
I0113 22:27:36.409000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
|
| 54 |
+
I0113 22:27:36.409000 4639 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
|
| 55 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
|
| 56 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
|
| 57 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.163
|
| 58 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
|
| 59 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=1
|
| 60 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
|
| 61 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
|
| 62 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[4, 5, 6, 7]
|
| 63 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[4, 5, 6, 7]
|
| 64 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
|
| 65 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
|
| 66 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525]
|
| 67 |
+
I0113 22:27:36.410000 4639 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
|
| 68 |
+
I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
|
| 69 |
+
I0113 22:27:36.410000 4639 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
|
| 70 |
+
I0113 22:27:36.410000 4515 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
|
| 71 |
+
I0113 22:27:36.410000 4515 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
|
| 72 |
+
I0113 22:27:36.410000 4515 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
|
| 73 |
+
[rank0]:[W113 22:27:51.533565081 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 74 |
+
[rank1]:[W113 22:27:51.533956704 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 75 |
+
[rank3]:[W113 22:27:51.535096922 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 76 |
+
[rank2]:[W113 22:27:51.539950908 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 77 |
+
[rank5]:[W113 22:27:51.792502773 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 78 |
+
[rank6]:[W113 22:27:51.792731829 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 79 |
+
[rank4]:[W113 22:27:51.794218912 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 80 |
+
[rank7]:[W113 22:27:51.794605275 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 81 |
+
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
|
| 82 |
+
wandb: Currently logged in as: zesen. Use `wandb login --relogin` to force relogin
|
| 83 |
+
wandb: Tracking run with wandb version 0.19.1
|
| 84 |
+
wandb: Run data is saved locally in /local/tmp.3396814/wandb/run-20250113_222758-wdn6kgy3
|
| 85 |
+
wandb: Run `wandb offline` to turn off syncing.
|
| 86 |
+
wandb: Syncing run 3396814
|
| 87 |
+
wandb: βοΈ View project at https://wandb.ai/zesen/decent-sam
|
| 88 |
+
wandb: π View run at https://wandb.ai/zesen/decent-sam/runs/wdn6kgy3
|
| 89 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 90 |
+
warnings.warn(
|
| 91 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 92 |
+
warnings.warn(
|
| 93 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 94 |
+
warnings.warn(
|
| 95 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 96 |
+
warnings.warn(
|
| 97 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 98 |
+
warnings.warn(
|
| 99 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 100 |
+
warnings.warn(
|
| 101 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 102 |
+
warnings.warn(
|
| 103 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 104 |
+
warnings.warn(
|
| 105 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 106 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 107 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 108 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 109 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 110 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 111 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 112 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 113 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 114 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 115 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 116 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 117 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 118 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 119 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 120 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 121 |
+
wandb:
|
| 122 |
+
wandb:
|
| 123 |
+
wandb: Run history:
|
| 124 |
+
wandb: epoch ββββββββββββββββββββ
β
β
β
β
β
βββββββββββββββ
|
| 125 |
+
wandb: epoch_train_time ββββββββββββββββββββββββββββββββββββββββ
|
| 126 |
+
wandb: loss ββββββββββββββββββββββββββββββββββββββββ
|
| 127 |
+
wandb: lr ββββ
β
βββββββββββββ
β
β
ββββββββββββββββββββ
|
| 128 |
+
wandb: total_train_time βββββββββββββββββββββββ
β
β
β
ββββββββββββββ
|
| 129 |
+
wandb: val_acc1 βββ
β
β
β
β
β
ββββββββββββββββββββββββββββββββ
|
| 130 |
+
wandb: val_acc5 βββ
βββββββββββββββββββββββββββββββββββββ
|
| 131 |
+
wandb: val_loss ββ
ββββββββββββββββββββββββββββββββββββββ
|
| 132 |
+
wandb:
|
| 133 |
+
wandb: Run summary:
|
| 134 |
+
wandb: epoch 90
|
| 135 |
+
wandb: epoch_train_time 92.50205
|
| 136 |
+
wandb: loss 1.77242
|
| 137 |
+
wandb: lr 1e-05
|
| 138 |
+
wandb: total_train_time 8475.51776
|
| 139 |
+
wandb: val_acc1 77.444
|
| 140 |
+
wandb: val_acc5 93.642
|
| 141 |
+
wandb: val_loss 1.95482
|
| 142 |
+
wandb:
|
| 143 |
+
[rank1]:[W114 00:53:43.628969837 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 144 |
+
[rank2]:[W114 00:53:43.646098834 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 145 |
+
[rank4]:[W114 00:53:43.647502784 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 146 |
+
[rank7]:[W114 00:53:43.652868377 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 147 |
+
[rank5]:[W114 00:53:43.654237363 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 148 |
+
[rank6]:[W114 00:53:43.680114338 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 149 |
+
wandb: π View run 3396814 at: https://wandb.ai/zesen/decent-sam/runs/wdn6kgy3
|
| 150 |
+
wandb: βοΈ View project at: https://wandb.ai/zesen/decent-sam
|
| 151 |
+
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
|
| 152 |
+
wandb: Find logs at: /local/tmp.3396814/wandb/run-20250113_222758-wdn6kgy3/logs
|
| 153 |
+
[rank3]:[W114 00:53:43.809175777 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 154 |
+
[rank0]:[W114 00:53:46.656309667 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 155 |
+
I0114 00:53:53.886000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
|
| 156 |
+
I0114 00:53:53.889000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
|
| 157 |
+
I0114 00:53:59.386000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
|
| 158 |
+
I0114 00:53:59.390000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
|
| 159 |
+
I0114 00:53:59.392000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0006356239318847656 seconds
|
| 160 |
+
I0114 00:53:59.392000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 5.502547264099121 seconds
|
| 161 |
+
[W114 00:53:59.606721821 TCPStore.cpp:131] [c10d] recvVector failed on SocketImpl(fd=4, addr=[alvis7-12.int.private]:59548, remote=[alvis7-11.int.private]:28052): failed to recv, got 0 bytes
|
| 162 |
+
Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:670 (most recent call first):
|
| 163 |
+
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x14f24b8ad446 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libc10.so)
|
| 164 |
+
frame #1: <unknown function> + 0x5fec818 (0x14f2868ef818 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
|
| 165 |
+
frame #2: <unknown function> + 0x5fece49 (0x14f2868efe49 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
|
| 166 |
+
frame #3: <unknown function> + 0x5fefd67 (0x14f2868f2d67 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
|
| 167 |
+
frame #4: c10d::TCPStore::compareSet(std::string const&, std::vector<unsigned char, std::allocator<unsigned char> > const&, std::vector<unsigned char, std::allocator<unsigned char> > const&) + 0x254 (0x14f2868ec5e4 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
|
| 168 |
+
frame #5: <unknown function> + 0xd79664 (0x14f2962e1664 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
|
| 169 |
+
frame #6: <unknown function> + 0x4cc1e3 (0x14f295a341e3 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
|
| 170 |
+
frame #7: <unknown function> + 0x224588 (0x56096c5b9588 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 171 |
+
frame #8: _PyObject_MakeTpCall + 0x2bb (0x56096c59975b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 172 |
+
frame #9: <unknown function> + 0x251777 (0x56096c5e6777 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 173 |
+
frame #10: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 174 |
+
frame #11: _PyObject_FastCallDictTstate + 0x1ee (0x56096c59c2fe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 175 |
+
frame #12: _PyObject_Call_Prepend + 0xe9 (0x56096c5c7739 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 176 |
+
frame #13: <unknown function> + 0x30364b (0x56096c69864b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 177 |
+
frame #14: _PyObject_Call + 0xb5 (0x56096c5ca135 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 178 |
+
frame #15: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 179 |
+
frame #16: PyEval_EvalCode + 0xa1 (0x56096c64f741 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 180 |
+
frame #17: <unknown function> + 0x2def1a (0x56096c673f1a in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 181 |
+
frame #18: <unknown function> + 0x2d9d35 (0x56096c66ed35 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 182 |
+
frame #19: <unknown function> + 0x2f2780 (0x56096c687780 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 183 |
+
frame #20: _PyRun_SimpleFileObject + 0x1ce (0x56096c686dfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 184 |
+
frame #21: _PyRun_AnyFileObject + 0x44 (0x56096c686ac4 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 185 |
+
frame #22: Py_RunMain + 0x2fe (0x56096c67fdfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 186 |
+
frame #23: Py_BytesMain + 0x37 (0x56096c63a0c7 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 187 |
+
frame #24: __libc_start_main + 0xe5 (0x14f297714d85 in /lib64/libc.so.6)
|
| 188 |
+
frame #25: <unknown function> + 0x2a4f71 (0x56096c639f71 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 189 |
+
|
| 190 |
+
W0114 00:53:59.426000 4515 site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1282] The node 'alvis7-12.int.private_4515_0' has failed to shutdown the rendezvous '26983' due to an error of type RendezvousConnectionError.
|
| 191 |
+
[W114 00:53:59.621070642 TCPStore.cpp:122] [c10d] sendBytes failed on SocketImpl(fd=4, addr=[alvis7-12.int.private]:59548, remote=[alvis7-11.int.private]:28052): Broken pipe
|
| 192 |
+
Exception raised from sendBytes at ../torch/csrc/distributed/c10d/Utils.hpp:645 (most recent call first):
|
| 193 |
+
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x14f24b8ad446 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libc10.so)
|
| 194 |
+
frame #1: <unknown function> + 0x5fecb29 (0x14f2868efb29 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
|
| 195 |
+
frame #2: c10d::TCPStore::compareSet(std::string const&, std::vector<unsigned char, std::allocator<unsigned char> > const&, std::vector<unsigned char, std::allocator<unsigned char> > const&) + 0x22d (0x14f2868ec5bd in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
|
| 196 |
+
frame #3: <unknown function> + 0xd79664 (0x14f2962e1664 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
|
| 197 |
+
frame #4: <unknown function> + 0x4cc1e3 (0x14f295a341e3 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
|
| 198 |
+
frame #5: <unknown function> + 0x224588 (0x56096c5b9588 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 199 |
+
frame #6: _PyObject_MakeTpCall + 0x2bb (0x56096c59975b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 200 |
+
frame #7: <unknown function> + 0x251777 (0x56096c5e6777 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 201 |
+
frame #8: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 202 |
+
frame #9: _PyObject_FastCallDictTstate + 0x1ee (0x56096c59c2fe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 203 |
+
frame #10: _PyObject_Call_Prepend + 0xe9 (0x56096c5c7739 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 204 |
+
frame #11: <unknown function> + 0x30364b (0x56096c69864b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 205 |
+
frame #12: _PyObject_Call + 0xb5 (0x56096c5ca135 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 206 |
+
frame #13: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 207 |
+
frame #14: PyEval_EvalCode + 0xa1 (0x56096c64f741 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 208 |
+
frame #15: <unknown function> + 0x2def1a (0x56096c673f1a in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 209 |
+
frame #16: <unknown function> + 0x2d9d35 (0x56096c66ed35 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 210 |
+
frame #17: <unknown function> + 0x2f2780 (0x56096c687780 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 211 |
+
frame #18: _PyRun_SimpleFileObject + 0x1ce (0x56096c686dfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 212 |
+
frame #19: _PyRun_AnyFileObject + 0x44 (0x56096c686ac4 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 213 |
+
frame #20: Py_RunMain + 0x2fe (0x56096c67fdfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 214 |
+
frame #21: Py_BytesMain + 0x37 (0x56096c63a0c7 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 215 |
+
frame #22: __libc_start_main + 0xe5 (0x14f297714d85 in /lib64/libc.so.6)
|
| 216 |
+
frame #23: <unknown function> + 0x2a4f71 (0x56096c639f71 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
|
| 217 |
+
|
| 218 |
+
W0114 00:53:59.431000 4515 site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1282] The node 'alvis7-12.int.private_4515_0' has failed to shutdown the rendezvous '26983' due to an error of type RendezvousConnectionError.
|
DSGDm-8-complete/3396814/log.out
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
DSGDm-8-complete/3396814/tb_trace/worker_00.1736803741716515488.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d867680757a0af1da55d1bb1b26e71571e5f33e51061116c8138855ae57b78b9
|
| 3 |
+
size 15061671
|
DSGDm-8-complete/3396814/tb_trace/worker_01.1736803741715256737.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fcb81cff5272a083ce11fb37bb154dbc20a8dd7e01f87d781c88ec68a4f3ca9a
|
| 3 |
+
size 15053264
|
DSGDm-8-complete/3396814/tb_trace/worker_02.1736803741714535034.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edf3ac376a3ee5e827c0066060ab79c5abd04700ab961ebb8103fdfb29c4a7a9
|
| 3 |
+
size 15106134
|
DSGDm-8-complete/3396814/tb_trace/worker_03.1736803741708035584.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5245069681853ea95ecbc6b0a55fbebe085f198e78ea06bb0f5d0cf8ae2d9312
|
| 3 |
+
size 15080397
|
DSGDm-8-complete/3396814/tb_trace/worker_04.1736803741713051100.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ec09c2cf4722fccf10c329c939c497480b3e46fc49bc6333cf55ed210bd7798
|
| 3 |
+
size 14997367
|
DSGDm-8-complete/3396814/tb_trace/worker_05.1736803741713040608.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83d7ba8c90eb93d926e058b361113252625044d61a01e9274838a98cd9bea799
|
| 3 |
+
size 15104925
|
DSGDm-8-complete/3396814/tb_trace/worker_06.1736803741713039723.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bed434ffdebf76928d8754ede222dd163baf3dd0a0a8e5960b3efc1681ee1f3
|
| 3 |
+
size 15118939
|
DSGDm-8-complete/3396814/tb_trace/worker_07.1736803741713054925.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e56f85f81b7fa09356f71da49848ae5214750307ec491539e80206aad2bd9dfd
|
| 3 |
+
size 15064239
|
DSGDm-8-complete/3396814/test_results.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
epoch,val_loss,val_acc1,val_acc5,val_samples
|
| 2 |
+
45.0,2.243170656890869,68.94799997314453,89.25399997314453,50000.0
|
| 3 |
+
90.0,1.9342285956573486,77.42600001464844,93.64999999023438,50000.0
|
DSGDm-8-complete/3396814/train_cfg.dump.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size = 1024
|
| 2 |
+
max_epochs = 90
|
| 3 |
+
lr = 1.0
|
| 4 |
+
label_smoothing = 0.1
|
| 5 |
+
grad_clip_norm = 0.0
|
| 6 |
+
checkpoint_dir = ""
|
| 7 |
+
arch = "resnet50"
|
| 8 |
+
use_amp = true
|
| 9 |
+
num_samples_for_stats = 102400
|
| 10 |
+
batch_size_per_local_batch = 128
|
| 11 |
+
|
| 12 |
+
[backend]
|
| 13 |
+
name = "decent-dp"
|
| 14 |
+
topology = "complete"
|
| 15 |
+
|
| 16 |
+
[preprocess]
|
| 17 |
+
preload_local = true
|
| 18 |
+
interpolation = "bilinear"
|
| 19 |
+
train_crop_size = 176
|
| 20 |
+
val_image_size = 256
|
| 21 |
+
val_crop_size = 224
|
| 22 |
+
|
| 23 |
+
[optim]
|
| 24 |
+
name = "sgd"
|
| 25 |
+
weight_decay = 3.0517578125e-05
|
| 26 |
+
momentum = 0.875
|
| 27 |
+
|
| 28 |
+
[lr_scheduler]
|
| 29 |
+
name = "cosine"
|
| 30 |
+
warmup_epochs = 5
|
| 31 |
+
warmup_decay = 0.01
|
| 32 |
+
eta_min = 1e-05
|
| 33 |
+
|
| 34 |
+
[reproduce]
|
| 35 |
+
seed = 810975
|
| 36 |
+
|
| 37 |
+
[log]
|
| 38 |
+
log_freq = 100
|
| 39 |
+
wandb_on = true
|
| 40 |
+
wandb_project = "decent-sam"
|
| 41 |
+
checkpoint_freq = 45
|
| 42 |
+
job_id = "3396814"
|
| 43 |
+
log_dir = "/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396814"
|
| 44 |
+
|
| 45 |
+
[network]
|
| 46 |
+
world_size = 8
|
| 47 |
+
rank = 0
|
| 48 |
+
local_rank = 0
|
| 49 |
+
local_world_size = 4
|
| 50 |
+
node_list = "alvis7-[11-12]"
|
DSGDm-8-complete/3396814/train_cfg.toml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size = 1024
|
| 2 |
+
max_epochs = 90
|
| 3 |
+
lr = 1.0
|
| 4 |
+
label_smoothing = 0.1
|
| 5 |
+
arch = "resnet50"
|
| 6 |
+
use_amp = true
|
| 7 |
+
|
| 8 |
+
[backend]
|
| 9 |
+
name = 'decent-dp'
|
| 10 |
+
topology = 'complete'
|
| 11 |
+
|
| 12 |
+
[preprocess]
|
| 13 |
+
preload_local = true
|
| 14 |
+
interpolation = "bilinear"
|
| 15 |
+
train_crop_size = 176
|
| 16 |
+
val_image_size = 256
|
| 17 |
+
val_crop_size = 224
|
| 18 |
+
|
| 19 |
+
[optim]
|
| 20 |
+
name = 'sgd'
|
| 21 |
+
momentum = 0.875
|
| 22 |
+
weight_decay = 0.000030517578125
|
| 23 |
+
|
| 24 |
+
[lr_scheduler]
|
| 25 |
+
name = 'cosine'
|
| 26 |
+
warmup_epochs = 5
|
| 27 |
+
warmup_decay = 0.01
|
| 28 |
+
|
| 29 |
+
[reproduce]
|
| 30 |
+
seed = 810975
|
| 31 |
+
|
| 32 |
+
[log]
|
| 33 |
+
log_freq = 100
|
| 34 |
+
wandb_on = true
|
| 35 |
+
wandb_project = "decent-sam"
|
| 36 |
+
checkpoint_freq = 45
|
DSGDm-8-complete/3396814/train_log.csv
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
epoch,step,train_loss,val_loss,val_acc1,val_acc5,time,checkpoint_dir
|
| 2 |
+
1,1251,6.127042294263268,5.393187141819,9.415999994888306,24.01799998046875,171.02069878578186,
|
| 3 |
+
2,2502,4.919748115072624,4.310746669931412,23.879999983062746,48.737999978942874,264.15189838409424,
|
| 4 |
+
3,3753,4.192763880550338,3.8052967199516297,33.935999976348874,60.11399998168945,356.6551320552826,
|
| 5 |
+
4,5004,3.8018151840908256,3.5090049266910555,39.88599997238159,66.44000001068115,448.89388370513916,
|
| 6 |
+
5,6255,3.5797099125899856,3.543133006410599,39.961999980163576,65.75999998565673,540.9955537319183,
|
| 7 |
+
6,7506,3.4164077854223196,3.179875017986298,46.81599998474121,73.16600000091553,632.9472050666809,
|
| 8 |
+
7,8757,3.2814336923434198,3.027001599264145,51.16999999053955,76.78600000061036,725.122312784195,
|
| 9 |
+
8,10008,3.1878556112448373,2.947021433906555,52.11799996795654,77.601999977417,817.4653673171997,
|
| 10 |
+
9,11259,3.1188443166365345,2.863088867096901,54.86399998687744,79.31199996948243,909.805721282959,
|
| 11 |
+
10,12510,3.0681457159568746,2.843212861433029,55.32599997894287,79.95799997955322,1002.1738886833191,
|
| 12 |
+
11,13761,3.0237617943403152,2.7861382486343382,55.98199997192383,80.5299999899292,1101.837260723114,
|
| 13 |
+
12,15012,2.9884546110384185,2.850523549041748,55.24799998901367,79.8479999710083,1193.7801671028137,
|
| 14 |
+
13,16263,2.9541580187235708,2.849518372297287,54.727999989013675,79.49399997619629,1285.6997277736664,
|
| 15 |
+
14,17514,2.9270401369515273,2.7073931032371523,58.00999997772217,82.0199999609375,1385.2878336906433,
|
| 16 |
+
15,18765,2.902910311587995,2.7079979345607756,57.44399997161865,81.63,1477.1498582363129,
|
| 17 |
+
16,20016,2.8812622801720096,2.7070354832077026,58.13599995605469,81.85399996612549,1568.9361598491669,
|
| 18 |
+
17,21267,2.858700812267933,2.676149680290222,59.13600000061035,82.65199996429443,1668.345825433731,
|
| 19 |
+
18,22518,2.8411981567537947,2.698109716157913,58.33599996612549,82.16799997070312,1760.14688038826,
|
| 20 |
+
19,23769,2.8249180473440845,2.7167605709171294,58.15199996765137,82.05399999267578,1852.1880807876587,
|
| 21 |
+
20,25020,2.807371502704948,2.5683171302080154,61.2079999899292,84.14799996917725,1944.4754102230072,
|
| 22 |
+
21,26271,2.793356894207992,2.7078482171201705,59.51799998657226,82.68399997192383,2036.822396993637,
|
| 23 |
+
22,27522,2.777170517819105,2.637347710971832,60.355999951171874,83.30399999267578,2129.101192712784,
|
| 24 |
+
23,28773,2.764402003930532,2.6262375885868074,60.78399997314453,83.60599996368408,2221.2840523719788,
|
| 25 |
+
24,30024,2.7518614687317378,2.6395135861110686,60.178,83.71399999511719,2313.138547182083,
|
| 26 |
+
25,31275,2.73986719895324,2.5539086744832993,62.165999970703126,84.8519999557495,2404.9164850711823,
|
| 27 |
+
26,32526,2.727668600378753,2.5298820478487016,62.27799997131348,85.00399998077393,2496.6911010742188,
|
| 28 |
+
27,33777,2.712991842572733,2.527021091747284,62.38000000396728,85.23199998077392,2596.0159952640533,
|
| 29 |
+
28,35028,2.702992696341851,2.5217206066703794,62.265999991149904,84.96999999267578,2687.8102848529816,
|
| 30 |
+
29,36279,2.690347477781782,2.5366484935188294,62.305999961853026,84.85999997558594,2779.9322304725647,
|
| 31 |
+
30,37530,2.677466812989504,2.4979143647289277,63.065999974060055,85.58599998840332,2879.574129343033,
|
| 32 |
+
31,38781,2.66630786688303,2.4680560279369352,63.2979999899292,85.91599999084472,2971.3960711956024,
|
| 33 |
+
32,40032,2.6550243053076077,2.4834610175657272,64.3399999710083,86.23999998870849,3063.2164845466614,
|
| 34 |
+
33,41283,2.6426750808406316,2.522367994060516,63.09799995697021,85.41600000946045,3162.8338882923126,
|
| 35 |
+
34,42534,2.631698535429202,2.5459611785078047,62.13799999328613,84.60599998291016,3254.787368297577,
|
| 36 |
+
35,43785,2.618261702531438,2.4807283532619477,63.8999999923706,85.79000000671387,3346.574568748474,
|
| 37 |
+
36,45036,2.6088119867941937,2.4904993689632415,63.58199998687744,85.90799997528076,3438.6902253627777,
|
| 38 |
+
37,46287,2.5972135505563827,2.5077360757446288,63.55999998168945,85.77999996917724,3530.874635219574,
|
| 39 |
+
38,47538,2.585281231801668,2.4784041890478132,63.43799998626709,85.66799996368408,3623.1314704418182,
|
| 40 |
+
39,48789,2.5742120447395136,2.5087062915229796,63.097999965515136,85.6260000012207,3715.4180703163147,
|
| 41 |
+
40,50040,2.5596383893423136,2.365015663309097,66.00799998535156,87.39399998596191,3807.6838648319244,
|
| 42 |
+
41,51291,2.5478728759012443,2.3786921441936495,65.76999997833252,87.38799997802734,3899.9565839767456,
|
| 43 |
+
42,52542,2.5356153436273123,2.511224712333679,63.1599999609375,85.66799996673583,3992.22243142128,
|
| 44 |
+
43,53793,2.5232722489096275,2.4498701629924775,64.17599996307374,86.299999972229,4091.8122646808624,
|
| 45 |
+
44,55044,2.5083332883296823,2.3946868074226377,66.29799998901368,87.46799999145507,4183.590287208557,
|
| 46 |
+
45,56295,2.4968910682067977,2.4412733810186387,64.58999997131347,86.41799998840332,4275.411991596222,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396814/checkpoints/model_45.pt
|
| 47 |
+
46,57546,2.484610610990692,2.3499680482530594,67.20599997772217,88.08799995697021,4374.895538806915,
|
| 48 |
+
47,58797,2.4715520979212724,2.4080028086328507,65.50199999725342,87.07999996765136,4466.660610198975,
|
| 49 |
+
48,60048,2.458013077517398,2.3507400181007387,66.89799997680664,88.02199997772217,4558.6080548763275,
|
| 50 |
+
49,61299,2.443213808820021,2.332980908880234,67.02399999145509,88.00199996185303,4658.193074226379,
|
| 51 |
+
50,62550,2.428716633293173,2.2817089555072783,68.12199998565674,88.84199997833252,4750.039880990982,
|
| 52 |
+
51,63801,2.4128139926899346,2.299946264133453,68.16999994476318,88.43199996490479,4841.821264743805,
|
| 53 |
+
52,65052,2.4001326122634605,2.2698288248491285,68.96199997497558,89.1659999835205,4933.92874789238,
|
| 54 |
+
53,66303,2.3864082657485652,2.3503749481487275,67.21599998535156,88.08799996429444,5025.790977239609,
|
| 55 |
+
54,67554,2.3690470303658198,2.252627236652374,69.21599998657227,89.2879999786377,5117.707024812698,
|
| 56 |
+
55,68805,2.352203613217119,2.2708754269838334,68.88399995697021,89.0299999761963,5209.992288351059,
|
| 57 |
+
56,70056,2.336971477007599,2.2444003923130036,69.33399998413086,89.4659999786377,5302.660699605942,
|
| 58 |
+
57,71307,2.322869612182454,2.252038153681755,69.87599996826172,89.66199996795655,5394.947804689407,
|
| 59 |
+
58,72558,2.303248344684581,2.2128678690290453,70.1979999710083,89.9659999520874,5486.980106830597,
|
| 60 |
+
59,73809,2.288970369586556,2.2328822772312162,70.2059999810791,89.95999995697021,5586.585511922836,
|
| 61 |
+
60,75060,2.2723432064032574,2.2107101944732666,70.17399997802734,89.99799996246338,5678.411100149155,
|
| 62 |
+
61,76311,2.2535721100777457,2.1873321256637572,71.1299999710083,90.43199996307374,5770.216231584549,
|
| 63 |
+
62,77562,2.2356884876410548,2.194494543762207,70.99799998138428,90.36199998687744,5869.540560007095,
|
| 64 |
+
63,78813,2.2184946567749235,2.1861774908304215,70.94199997589111,90.51399994659424,5961.295742034912,
|
| 65 |
+
64,80064,2.2009496229777423,2.1907457696056367,71.15199998352051,90.33199999176026,6053.159170150757,
|
| 66 |
+
65,81315,2.1804827656820236,2.2064537956762313,70.41599998168945,89.87799998687744,6152.669972419739,
|
| 67 |
+
66,82566,2.1623485805891116,2.1926137644958494,71.59399997406005,90.46799996826172,6244.473973035812,
|
| 68 |
+
67,83817,2.141807143946441,2.1369869295930863,72.65399998504638,91.18199997924805,6336.294122695923,
|
| 69 |
+
68,85068,2.121135162935554,2.112554798183441,73.03400000091553,91.20199997955322,6428.506133556366,
|
| 70 |
+
69,86319,2.1001662009244533,2.141602432551384,72.56600002227783,90.95999997650146,6520.811984539032,
|
| 71 |
+
70,87570,2.0805942892885323,2.0747684485530855,73.6819999633789,91.84599995513916,6613.024798870087,
|
| 72 |
+
71,88821,2.058419024343971,2.0777628285312653,73.84399996063233,91.9539999633789,6705.051984786987,
|
| 73 |
+
72,90072,2.0374853375504056,2.076268817982674,74.12999997619629,91.99999996307373,6796.896181344986,
|
| 74 |
+
73,91323,2.0162740735579834,2.091451440925598,73.89199997924804,91.98399995239258,6888.699318885803,
|
| 75 |
+
74,92574,1.9920739202405051,2.0530714088726043,74.39799999786376,92.1759999685669,6980.464948892593,
|
| 76 |
+
75,93825,1.9711303551920312,2.0350256051921845,74.98399995147705,92.43399996032714,7079.879634380341,
|
| 77 |
+
76,95076,1.9504838722000877,2.0443485179948806,75.1579999874878,92.36399995239258,7171.66074180603,
|
| 78 |
+
77,96327,1.929455490623542,2.011657994995117,75.62199999267578,92.82799995788574,7263.498735189438,
|
| 79 |
+
78,97578,1.908993336508314,2.0061387955236434,76.07999998199463,92.93799995239257,7362.967435836792,
|
| 80 |
+
79,98829,1.8885678461463236,2.0084419351196288,76.15599997406005,93.011999949646,7454.897849321365,
|
| 81 |
+
80,100080,1.867487807222884,1.989534666991234,76.48199998199463,93.1919999472046,7546.6971027851105,
|
| 82 |
+
81,101331,1.8502448157583782,1.97987747964859,76.56599999542236,93.25399994720459,7646.266963481903,
|
| 83 |
+
82,102582,1.8327934436684699,1.9752510207366942,76.84799997680663,93.369999944458,7738.12802696228,
|
| 84 |
+
83,103833,1.8193008570934084,1.9674947014093398,77.00599994995117,93.42199994720458,7830.075203418732,
|
| 85 |
+
84,105084,1.8070869094652715,1.9616766112709045,77.21199997406006,93.51199994445801,7922.026429891586,
|
| 86 |
+
85,106335,1.7945735428116019,1.9585169447088242,77.2159999822998,93.61799994171143,8014.047864198685,
|
| 87 |
+
86,107586,1.7852201788164348,1.9618321807050705,77.25799998504638,93.58599994171142,8106.137816429138,
|
| 88 |
+
87,108837,1.7775084370975014,1.9541121759557725,77.33799998474122,93.62399995269776,8198.392191886902,
|
| 89 |
+
88,110088,1.773624649925007,1.9565625371456146,77.40199996063232,93.605999944458,8290.748585700989,
|
| 90 |
+
89,111339,1.7710741437096105,1.9537402257823944,77.37199999572753,93.617999944458,8383.015711069107,
|
| 91 |
+
90,112590,1.768616415947366,1.9548244094467162,77.44399997680664,93.64199994445801,8475.517761468887,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396814/checkpoints/model_90.pt
|
DSGDm-8-complete/3396815/checkpoints/model_45.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:245e03e1a3d6c8948c3e217637a60ae49f699aede3528820ccdddfa3065c8abd
|
| 3 |
+
size 102518166
|
DSGDm-8-complete/3396815/checkpoints/model_90.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43665e2f00ab308acaaaa4a2a19febf3ce07060d599bde51c78e27b8f88c7cd7
|
| 3 |
+
size 102518166
|
DSGDm-8-complete/3396815/data_cfg.dump.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_dir = "./data/Imagenet"
|
| 2 |
+
num_classes = 1000
|
| 3 |
+
|
| 4 |
+
[dataloader]
|
| 5 |
+
name = "ffcv"
|
| 6 |
+
processed_data_dir = "./data/ffcv"
|
| 7 |
+
max_resolution = 500
|
| 8 |
+
compress_probability = 1.0
|
| 9 |
+
jpeg_quality = 90
|
| 10 |
+
num_data_workers = 12
|
| 11 |
+
in_memory = true
|
| 12 |
+
tag = "ffcv_500_1.000_90"
|
| 13 |
+
train_data_dir = "./data/ffcv/ffcv_500_1.000_90_train.ffcv"
|
| 14 |
+
val_data_dir = "./data/ffcv/ffcv_500_1.000_90_val.ffcv"
|
DSGDm-8-complete/3396815/data_cfg.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_dir = "./data/Imagenet"
|
| 2 |
+
num_classes = 1000
|
| 3 |
+
|
| 4 |
+
[dataloader]
|
| 5 |
+
name = "ffcv"
|
| 6 |
+
processed_data_dir = "./data/ffcv"
|
| 7 |
+
max_resolution = 500
|
| 8 |
+
compress_probability = 1.0
|
| 9 |
+
jpeg_quality = 90
|
| 10 |
+
num_data_workers = 12
|
| 11 |
+
|
| 12 |
+
# [dataloader]
|
| 13 |
+
# name = "dali"
|
| 14 |
+
# preload = true
|
| 15 |
+
# sharded_data_dir = "./data/Imagenet-sharded"
|
| 16 |
+
# num_data_workers = 8
|
DSGDm-8-complete/3396815/err.out
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793]
|
| 2 |
+
W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793] *****************************************
|
| 3 |
+
W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793] *****************************************
|
| 5 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
|
| 6 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
|
| 7 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
|
| 8 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
|
| 9 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
|
| 10 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] run_id : 28584
|
| 11 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
|
| 12 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.171:28052
|
| 13 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
|
| 14 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
|
| 15 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
|
| 16 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396815/torchelastic_ii5e9hc5
|
| 17 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
|
| 18 |
+
I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194]
|
| 19 |
+
W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793]
|
| 20 |
+
W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793] *****************************************
|
| 21 |
+
W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 22 |
+
W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793] *****************************************
|
| 23 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
|
| 24 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
|
| 25 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
|
| 26 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
|
| 27 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
|
| 28 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] run_id : 28584
|
| 29 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
|
| 30 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.171:28052
|
| 31 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
|
| 32 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
|
| 33 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
|
| 34 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396815/torchelastic_lgc1b5s_
|
| 35 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
|
| 36 |
+
I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194]
|
| 37 |
+
I0113 22:27:34.122000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
|
| 38 |
+
I0113 22:27:34.122000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
|
| 39 |
+
I0113 22:27:34.545000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
|
| 40 |
+
I0113 22:27:34.546000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
|
| 41 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
|
| 42 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
|
| 43 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.171
|
| 44 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
|
| 45 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=0
|
| 46 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
|
| 47 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
|
| 48 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3]
|
| 49 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3]
|
| 50 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
|
| 51 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
|
| 52 |
+
I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525]
|
| 53 |
+
I0113 22:27:35.394000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
|
| 54 |
+
I0113 22:27:35.394000 4547 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
|
| 55 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
|
| 56 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
|
| 57 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.171
|
| 58 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
|
| 59 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=1
|
| 60 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
|
| 61 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
|
| 62 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[4, 5, 6, 7]
|
| 63 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[4, 5, 6, 7]
|
| 64 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
|
| 65 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
|
| 66 |
+
I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525]
|
| 67 |
+
I0113 22:27:35.395000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
|
| 68 |
+
I0113 22:27:35.395000 4547 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
|
| 69 |
+
I0113 22:27:35.395000 4547 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
|
| 70 |
+
I0113 22:27:35.395000 4590 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
|
| 71 |
+
I0113 22:27:35.395000 4590 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
|
| 72 |
+
I0113 22:27:35.396000 4590 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
|
| 73 |
+
[rank0]:[W113 22:27:49.888137996 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 74 |
+
[rank3]:[W113 22:27:49.888186841 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 75 |
+
[rank1]:[W113 22:27:49.888231426 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 76 |
+
[rank2]:[W113 22:27:49.888452742 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 77 |
+
[rank5]:[W113 22:27:50.531696251 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 78 |
+
[rank6]:[W113 22:27:50.532009107 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 79 |
+
[rank4]:[W113 22:27:50.532444240 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 80 |
+
[rank7]:[W113 22:27:50.532668073 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 81 |
+
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
|
| 82 |
+
wandb: Currently logged in as: zesen. Use `wandb login --relogin` to force relogin
|
| 83 |
+
wandb: Tracking run with wandb version 0.19.1
|
| 84 |
+
wandb: Run data is saved locally in /local/tmp.3396815/wandb/run-20250113_222756-xtm2f8go
|
| 85 |
+
wandb: Run `wandb offline` to turn off syncing.
|
| 86 |
+
wandb: Syncing run 3396815
|
| 87 |
+
wandb: βοΈ View project at https://wandb.ai/zesen/decent-sam
|
| 88 |
+
wandb: π View run at https://wandb.ai/zesen/decent-sam/runs/xtm2f8go
|
| 89 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 90 |
+
warnings.warn(
|
| 91 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 92 |
+
warnings.warn(
|
| 93 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 94 |
+
warnings.warn(
|
| 95 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 96 |
+
warnings.warn(
|
| 97 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 98 |
+
warnings.warn(
|
| 99 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 100 |
+
warnings.warn(
|
| 101 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 102 |
+
warnings.warn(
|
| 103 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
| 104 |
+
warnings.warn(
|
| 105 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 106 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 107 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 108 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 109 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 110 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 111 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 112 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 113 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 114 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 115 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 116 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 117 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 118 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 119 |
+
/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
|
| 120 |
+
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
|
| 121 |
+
wandb: uploading config.yaml; uploading output.log
|
| 122 |
+
wandb:
|
| 123 |
+
[rank1]:[W114 00:53:39.318429238 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 124 |
+
[rank6]:[W114 00:53:39.385046968 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 125 |
+
[rank7]:[W114 00:53:39.411510035 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 126 |
+
[rank5]:[W114 00:53:39.417334380 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 127 |
+
[rank2]:[W114 00:53:39.476722623 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 128 |
+
[rank3]:[W114 00:53:39.594174452 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 129 |
+
wandb:
|
| 130 |
+
wandb: Run history:
|
| 131 |
+
wandb: epoch βββββββββββββββββββββ
β
β
β
β
β
β
βββββββββββββ
|
| 132 |
+
wandb: epoch_train_time ββββββββββββββββββββββββββββββββββββββββ
|
| 133 |
+
wandb: loss βββββββ
β
β
β
β
β
β
β
β
βββββββοΏ½οΏ½βββββββββββββββββ
|
| 134 |
+
wandb: lr βββ
βββββββββββββββββββββ
β
βββββββββββββββ
|
| 135 |
+
wandb: total_train_time βββββββββββββββββββββββ
β
β
β
β
βββββββββββββ
|
| 136 |
+
wandb: val_acc1 βββ
β
β
βββββββββββββββββββββββββββββββββββ
|
| 137 |
+
wandb: val_acc5 βββ
β
ββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
wandb: val_loss ββ
ββββββββββββββββββββββββββββββββββββββ
|
| 139 |
+
wandb:
|
| 140 |
+
wandb: Run summary:
|
| 141 |
+
wandb: epoch 90
|
| 142 |
+
wandb: epoch_train_time 92.11154
|
| 143 |
+
wandb: loss 1.76962
|
| 144 |
+
wandb: lr 1e-05
|
| 145 |
+
wandb: total_train_time 8472.76438
|
| 146 |
+
wandb: val_acc1 77.424
|
| 147 |
+
wandb: val_acc5 93.626
|
| 148 |
+
wandb: val_loss 1.95212
|
| 149 |
+
wandb:
|
| 150 |
+
wandb: π View run 3396815 at: https://wandb.ai/zesen/decent-sam/runs/xtm2f8go
|
| 151 |
+
wandb: βοΈ View project at: https://wandb.ai/zesen/decent-sam
|
| 152 |
+
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
|
| 153 |
+
wandb: Find logs at: /local/tmp.3396815/wandb/run-20250113_222756-xtm2f8go/logs
|
| 154 |
+
[rank4]:[W114 00:53:40.296904513 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 155 |
+
[rank0]:[W114 00:53:42.625334345 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
|
| 156 |
+
I0114 00:53:49.018000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
|
| 157 |
+
I0114 00:53:49.020000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
|
| 158 |
+
I0114 00:53:50.667000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
|
| 159 |
+
I0114 00:53:50.671000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
|
| 160 |
+
I0114 00:53:50.673000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 1.6517422199249268 seconds
|
| 161 |
+
I0114 00:53:50.673000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0016102790832519531 seconds
|
DSGDm-8-complete/3396815/log.out
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
DSGDm-8-complete/3396815/tb_trace/worker_00.1736803740196062136.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:698fa85f677b0ee04ff6b110da0ff8dc437f180cd11345309c594d9b3015a2d6
|
| 3 |
+
size 15124021
|
DSGDm-8-complete/3396815/tb_trace/worker_01.1736803740195517240.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a816ca68f36cdbc4b5077d234efacd35f5896cb3536d72288ba964e0e87299f9
|
| 3 |
+
size 15018027
|
DSGDm-8-complete/3396815/tb_trace/worker_02.1736803740198629006.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97f5a14852e2cc062db678418494a2a262914e0e4daa513c426bd9307d1bd8f8
|
| 3 |
+
size 14978834
|
DSGDm-8-complete/3396815/tb_trace/worker_03.1736803740196019080.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef92f5bce4215d5eace0a69adc734a94b040839b29b5ca852e28c1a9551d70ef
|
| 3 |
+
size 15083314
|
DSGDm-8-complete/3396815/tb_trace/worker_04.1736803740196645296.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d40d4edc76fee681975349672c9fd9742c5fb5caa7d431d29ad64aae3e9b15e6
|
| 3 |
+
size 15064200
|
DSGDm-8-complete/3396815/tb_trace/worker_05.1736803740196646399.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84653bdf8ff5388deb57bc282a4a29eeb60f5f09317a2169641667ba68319fee
|
| 3 |
+
size 15027919
|
DSGDm-8-complete/3396815/tb_trace/worker_06.1736803740196720663.pt.trace.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57c12e1c1fbb6be2abfffeca0ae2444f8257cb686852226385d4d252efdf7242
|
| 3 |
+
size 15094896
|