zesen-kth committed on
Commit
9bcd604
·
verified ·
1 Parent(s): 465f74a

Update models for DSGDm-8-complete

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +24 -0
  2. DSGDm-8-complete/3396810/checkpoints/model_45.pt +3 -0
  3. DSGDm-8-complete/3396810/checkpoints/model_90.pt +3 -0
  4. DSGDm-8-complete/3396810/data_cfg.dump.toml +14 -0
  5. DSGDm-8-complete/3396810/data_cfg.toml +16 -0
  6. DSGDm-8-complete/3396810/err.out +160 -0
  7. DSGDm-8-complete/3396810/log.out +0 -0
  8. DSGDm-8-complete/3396810/tb_trace/worker_00.1736803679700367017.pt.trace.json +3 -0
  9. DSGDm-8-complete/3396810/tb_trace/worker_01.1736803679694837989.pt.trace.json +3 -0
  10. DSGDm-8-complete/3396810/tb_trace/worker_02.1736803679700306866.pt.trace.json +3 -0
  11. DSGDm-8-complete/3396810/tb_trace/worker_03.1736803679701051019.pt.trace.json +3 -0
  12. DSGDm-8-complete/3396810/tb_trace/worker_04.1736803679699263562.pt.trace.json +3 -0
  13. DSGDm-8-complete/3396810/tb_trace/worker_05.1736803679694488849.pt.trace.json +3 -0
  14. DSGDm-8-complete/3396810/tb_trace/worker_06.1736803679698583528.pt.trace.json +3 -0
  15. DSGDm-8-complete/3396810/tb_trace/worker_07.1736803679698570448.pt.trace.json +3 -0
  16. DSGDm-8-complete/3396810/test_results.csv +3 -0
  17. DSGDm-8-complete/3396810/train_cfg.dump.toml +50 -0
  18. DSGDm-8-complete/3396810/train_cfg.toml +36 -0
  19. DSGDm-8-complete/3396810/train_log.csv +91 -0
  20. DSGDm-8-complete/3396814/checkpoints/model_45.pt +3 -0
  21. DSGDm-8-complete/3396814/checkpoints/model_90.pt +3 -0
  22. DSGDm-8-complete/3396814/data_cfg.dump.toml +14 -0
  23. DSGDm-8-complete/3396814/data_cfg.toml +16 -0
  24. DSGDm-8-complete/3396814/err.out +218 -0
  25. DSGDm-8-complete/3396814/log.out +0 -0
  26. DSGDm-8-complete/3396814/tb_trace/worker_00.1736803741716515488.pt.trace.json +3 -0
  27. DSGDm-8-complete/3396814/tb_trace/worker_01.1736803741715256737.pt.trace.json +3 -0
  28. DSGDm-8-complete/3396814/tb_trace/worker_02.1736803741714535034.pt.trace.json +3 -0
  29. DSGDm-8-complete/3396814/tb_trace/worker_03.1736803741708035584.pt.trace.json +3 -0
  30. DSGDm-8-complete/3396814/tb_trace/worker_04.1736803741713051100.pt.trace.json +3 -0
  31. DSGDm-8-complete/3396814/tb_trace/worker_05.1736803741713040608.pt.trace.json +3 -0
  32. DSGDm-8-complete/3396814/tb_trace/worker_06.1736803741713039723.pt.trace.json +3 -0
  33. DSGDm-8-complete/3396814/tb_trace/worker_07.1736803741713054925.pt.trace.json +3 -0
  34. DSGDm-8-complete/3396814/test_results.csv +3 -0
  35. DSGDm-8-complete/3396814/train_cfg.dump.toml +50 -0
  36. DSGDm-8-complete/3396814/train_cfg.toml +36 -0
  37. DSGDm-8-complete/3396814/train_log.csv +91 -0
  38. DSGDm-8-complete/3396815/checkpoints/model_45.pt +3 -0
  39. DSGDm-8-complete/3396815/checkpoints/model_90.pt +3 -0
  40. DSGDm-8-complete/3396815/data_cfg.dump.toml +14 -0
  41. DSGDm-8-complete/3396815/data_cfg.toml +16 -0
  42. DSGDm-8-complete/3396815/err.out +161 -0
  43. DSGDm-8-complete/3396815/log.out +0 -0
  44. DSGDm-8-complete/3396815/tb_trace/worker_00.1736803740196062136.pt.trace.json +3 -0
  45. DSGDm-8-complete/3396815/tb_trace/worker_01.1736803740195517240.pt.trace.json +3 -0
  46. DSGDm-8-complete/3396815/tb_trace/worker_02.1736803740198629006.pt.trace.json +3 -0
  47. DSGDm-8-complete/3396815/tb_trace/worker_03.1736803740196019080.pt.trace.json +3 -0
  48. DSGDm-8-complete/3396815/tb_trace/worker_04.1736803740196645296.pt.trace.json +3 -0
  49. DSGDm-8-complete/3396815/tb_trace/worker_05.1736803740196646399.pt.trace.json +3 -0
  50. DSGDm-8-complete/3396815/tb_trace/worker_06.1736803740196720663.pt.trace.json +3 -0
.gitattributes CHANGED
@@ -69,3 +69,27 @@ DSGDm-8-ring/3396800/tb_trace/worker_04.1736802649422631598.pt.trace.json filter
69
  DSGDm-8-ring/3396800/tb_trace/worker_05.1736802649420739266.pt.trace.json filter=lfs diff=lfs merge=lfs -text
70
  DSGDm-8-ring/3396800/tb_trace/worker_06.1736802649437161614.pt.trace.json filter=lfs diff=lfs merge=lfs -text
71
  DSGDm-8-ring/3396800/tb_trace/worker_07.1736802649437846610.pt.trace.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  DSGDm-8-ring/3396800/tb_trace/worker_05.1736802649420739266.pt.trace.json filter=lfs diff=lfs merge=lfs -text
70
  DSGDm-8-ring/3396800/tb_trace/worker_06.1736802649437161614.pt.trace.json filter=lfs diff=lfs merge=lfs -text
71
  DSGDm-8-ring/3396800/tb_trace/worker_07.1736802649437846610.pt.trace.json filter=lfs diff=lfs merge=lfs -text
72
+ DSGDm-8-complete/3396810/tb_trace/worker_00.1736803679700367017.pt.trace.json filter=lfs diff=lfs merge=lfs -text
73
+ DSGDm-8-complete/3396810/tb_trace/worker_01.1736803679694837989.pt.trace.json filter=lfs diff=lfs merge=lfs -text
74
+ DSGDm-8-complete/3396810/tb_trace/worker_02.1736803679700306866.pt.trace.json filter=lfs diff=lfs merge=lfs -text
75
+ DSGDm-8-complete/3396810/tb_trace/worker_03.1736803679701051019.pt.trace.json filter=lfs diff=lfs merge=lfs -text
76
+ DSGDm-8-complete/3396810/tb_trace/worker_04.1736803679699263562.pt.trace.json filter=lfs diff=lfs merge=lfs -text
77
+ DSGDm-8-complete/3396810/tb_trace/worker_05.1736803679694488849.pt.trace.json filter=lfs diff=lfs merge=lfs -text
78
+ DSGDm-8-complete/3396810/tb_trace/worker_06.1736803679698583528.pt.trace.json filter=lfs diff=lfs merge=lfs -text
79
+ DSGDm-8-complete/3396810/tb_trace/worker_07.1736803679698570448.pt.trace.json filter=lfs diff=lfs merge=lfs -text
80
+ DSGDm-8-complete/3396814/tb_trace/worker_00.1736803741716515488.pt.trace.json filter=lfs diff=lfs merge=lfs -text
81
+ DSGDm-8-complete/3396814/tb_trace/worker_01.1736803741715256737.pt.trace.json filter=lfs diff=lfs merge=lfs -text
82
+ DSGDm-8-complete/3396814/tb_trace/worker_02.1736803741714535034.pt.trace.json filter=lfs diff=lfs merge=lfs -text
83
+ DSGDm-8-complete/3396814/tb_trace/worker_03.1736803741708035584.pt.trace.json filter=lfs diff=lfs merge=lfs -text
84
+ DSGDm-8-complete/3396814/tb_trace/worker_04.1736803741713051100.pt.trace.json filter=lfs diff=lfs merge=lfs -text
85
+ DSGDm-8-complete/3396814/tb_trace/worker_05.1736803741713040608.pt.trace.json filter=lfs diff=lfs merge=lfs -text
86
+ DSGDm-8-complete/3396814/tb_trace/worker_06.1736803741713039723.pt.trace.json filter=lfs diff=lfs merge=lfs -text
87
+ DSGDm-8-complete/3396814/tb_trace/worker_07.1736803741713054925.pt.trace.json filter=lfs diff=lfs merge=lfs -text
88
+ DSGDm-8-complete/3396815/tb_trace/worker_00.1736803740196062136.pt.trace.json filter=lfs diff=lfs merge=lfs -text
89
+ DSGDm-8-complete/3396815/tb_trace/worker_01.1736803740195517240.pt.trace.json filter=lfs diff=lfs merge=lfs -text
90
+ DSGDm-8-complete/3396815/tb_trace/worker_02.1736803740198629006.pt.trace.json filter=lfs diff=lfs merge=lfs -text
91
+ DSGDm-8-complete/3396815/tb_trace/worker_03.1736803740196019080.pt.trace.json filter=lfs diff=lfs merge=lfs -text
92
+ DSGDm-8-complete/3396815/tb_trace/worker_04.1736803740196645296.pt.trace.json filter=lfs diff=lfs merge=lfs -text
93
+ DSGDm-8-complete/3396815/tb_trace/worker_05.1736803740196646399.pt.trace.json filter=lfs diff=lfs merge=lfs -text
94
+ DSGDm-8-complete/3396815/tb_trace/worker_06.1736803740196720663.pt.trace.json filter=lfs diff=lfs merge=lfs -text
95
+ DSGDm-8-complete/3396815/tb_trace/worker_07.1736803740196718891.pt.trace.json filter=lfs diff=lfs merge=lfs -text
DSGDm-8-complete/3396810/checkpoints/model_45.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95bcc99e642ccb43936d3df59ad3a9bbf7bf6e077135484f1aeddb493423fc9d
3
+ size 102518166
DSGDm-8-complete/3396810/checkpoints/model_90.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98b5a3006e6f3d99def1f35e3e9e873c55bc4f69df3cebe886cdf83d63ce02c1
3
+ size 102518166
DSGDm-8-complete/3396810/data_cfg.dump.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_dir = "./data/Imagenet"
2
+ num_classes = 1000
3
+
4
+ [dataloader]
5
+ name = "ffcv"
6
+ processed_data_dir = "./data/ffcv"
7
+ max_resolution = 500
8
+ compress_probability = 1.0
9
+ jpeg_quality = 90
10
+ num_data_workers = 12
11
+ in_memory = true
12
+ tag = "ffcv_500_1.000_90"
13
+ train_data_dir = "./data/ffcv/ffcv_500_1.000_90_train.ffcv"
14
+ val_data_dir = "./data/ffcv/ffcv_500_1.000_90_val.ffcv"
DSGDm-8-complete/3396810/data_cfg.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_dir = "./data/Imagenet"
2
+ num_classes = 1000
3
+
4
+ [dataloader]
5
+ name = "ffcv"
6
+ processed_data_dir = "./data/ffcv"
7
+ max_resolution = 500
8
+ compress_probability = 1.0
9
+ jpeg_quality = 90
10
+ num_data_workers = 12
11
+
12
+ # [dataloader]
13
+ # name = "dali"
14
+ # preload = true
15
+ # sharded_data_dir = "./data/Imagenet-sharded"
16
+ # num_data_workers = 8
DSGDm-8-complete/3396810/err.out ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793]
2
+ W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793] *****************************************
3
+ W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ W0113 22:26:26.346000 4536 site-packages/torch/distributed/run.py:793] *****************************************
5
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
6
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
7
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
8
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
9
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
10
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] run_id : 30041
11
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
12
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.161:28052
13
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
14
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
15
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
16
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396810/torchelastic_brzz4zhi
17
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
18
+ I0113 22:26:26.347000 4536 site-packages/torch/distributed/launcher/api.py:194]
19
+ I0113 22:26:26.357000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
20
+ I0113 22:26:26.358000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
21
+ W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793]
22
+ W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793] *****************************************
23
+ W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
+ W0113 22:26:27.631000 4457 site-packages/torch/distributed/run.py:793] *****************************************
25
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
26
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
27
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
28
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
29
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
30
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] run_id : 30041
31
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
32
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.161:28052
33
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
34
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
35
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
36
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396810/torchelastic_1cs_tjn0
37
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
38
+ I0113 22:26:27.632000 4457 site-packages/torch/distributed/launcher/api.py:194]
39
+ I0113 22:26:27.643000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
40
+ I0113 22:26:27.644000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
41
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
42
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
43
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.161
44
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
45
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=0
46
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
47
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
48
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3]
49
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3]
50
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
51
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
52
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:525]
53
+ I0113 22:26:28.627000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
54
+ I0113 22:26:28.628000 4536 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
55
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
56
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
57
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.161
58
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
59
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=1
60
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
61
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
62
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[4, 5, 6, 7]
63
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[4, 5, 6, 7]
64
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
65
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
66
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:525]
67
+ I0113 22:26:28.628000 4536 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
68
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
69
+ I0113 22:26:28.628000 4457 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
70
+ I0113 22:26:28.629000 4536 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
71
+ I0113 22:26:28.629000 4457 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
72
+ I0113 22:26:28.629000 4457 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
73
+ [rank1]:[W113 22:26:44.646740965 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
74
+ [rank3]:[W113 22:26:44.646748763 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
75
+ [rank0]:[W113 22:26:44.646756097 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
76
+ [rank2]:[W113 22:26:44.646843940 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
77
+ [rank4]:[W113 22:26:46.899645566 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
78
+ [rank7]:[W113 22:26:46.900202635 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
79
+ [rank6]:[W113 22:26:46.900424479 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
80
+ [rank5]:[W113 22:26:46.900872526 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
81
+ wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
82
+ wandb: Currently logged in as: zesen. Use `wandb login --relogin` to force relogin
83
+ wandb: Tracking run with wandb version 0.19.1
84
+ wandb: Run data is saved locally in /local/tmp.3396810/wandb/run-20250113_222653-b5c2ab3l
85
+ wandb: Run `wandb offline` to turn off syncing.
86
+ wandb: Syncing run 3396810
87
+ wandb: ⭐️ View project at https://wandb.ai/zesen/decent-sam
88
+ wandb: 🚀 View run at https://wandb.ai/zesen/decent-sam/runs/b5c2ab3l
89
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
90
+ warnings.warn(
91
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
92
+ warnings.warn(
93
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
94
+ warnings.warn(
95
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
96
+ warnings.warn(
97
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
98
+ warnings.warn(
99
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
100
+ warnings.warn(
101
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
102
+ warnings.warn(
103
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
104
+ warnings.warn(
105
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
106
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
107
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
108
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
109
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
110
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
111
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
112
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
113
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
114
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
115
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
116
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
117
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
118
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
119
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
120
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
121
+ wandb:
122
+ [rank3]:[W114 00:52:45.077889515 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
123
+ wandb:
124
+ wandb: Run history:
125
+ wandb: epoch ▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
126
+ wandb: epoch_train_time β–ˆβ–β–β–β–β–β–β–β–β–β–β–β–‚β–β–β–‚β–β–β–β–β–‚β–‚β–β–β–β–β–β–‚β–β–β–β–β–‚β–β–‚β–β–β–β–β–
127
+ wandb: loss β–ˆβ–…β–…β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–β–β–β–β–β–β–
128
+ wandb: lr β–β–β–‚β–ƒβ–…β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–‡β–‡β–‡β–†β–†β–†β–†β–…β–…β–…β–„β–„β–„β–„β–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–β–β–
129
+ wandb: total_train_time β–β–β–β–β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–„β–„β–„β–„β–„β–…β–…β–…β–…β–…β–…β–…β–…β–…β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆ
130
+ wandb: val_acc1 β–β–‚β–„β–…β–…β–…β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
131
+ wandb: val_acc5 β–β–ƒβ–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–‡β–‡β–‡β–ˆβ–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
132
+ wandb: val_loss β–ˆβ–‡β–…β–…β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–β–β–β–β–
133
+ wandb:
134
+ wandb: Run summary:
135
+ wandb: epoch 90
136
+ wandb: epoch_train_time 92.1866
137
+ wandb: loss 1.75573
138
+ wandb: lr 1e-05
139
+ wandb: total_train_time 8481.97145
140
+ wandb: val_acc1 77.52
141
+ wandb: val_acc5 93.644
142
+ wandb: val_loss 1.95997
143
+ wandb:
144
+ [rank6]:[W114 00:52:46.212017408 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
145
+ [rank1]:[W114 00:52:46.299757777 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
146
+ [rank7]:[W114 00:52:46.318365775 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
147
+ [rank2]:[W114 00:52:46.362530996 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
148
+ [rank5]:[W114 00:52:46.355965645 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
149
+ [rank4]:[W114 00:52:46.356968285 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
150
+ wandb: πŸš€ View run 3396810 at: https://wandb.ai/zesen/decent-sam/runs/b5c2ab3l
151
+ wandb: ⭐️ View project at: https://wandb.ai/zesen/decent-sam
152
+ wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
153
+ wandb: Find logs at: /local/tmp.3396810/wandb/run-20250113_222653-b5c2ab3l/logs
154
+ [rank0]:[W114 00:52:49.322995684 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
155
+ I0114 00:52:56.487000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
156
+ I0114 00:52:56.489000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
157
+ I0114 00:53:01.055000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
158
+ I0114 00:53:01.059000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
159
+ I0114 00:53:01.060000 4457 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0008060932159423828 seconds
160
+ I0114 00:53:01.061000 4536 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 4.571002244949341 seconds
DSGDm-8-complete/3396810/log.out ADDED
The diff for this file is too large to render. See raw diff
 
DSGDm-8-complete/3396810/tb_trace/worker_00.1736803679700367017.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28868768a79899aeb44d84454a68bb24e4aa65dd94086a1b4a3079bb0ec0973a
3
+ size 15106886
DSGDm-8-complete/3396810/tb_trace/worker_01.1736803679694837989.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a16ed4e388c73efea1694f3533f2598cb83ccb4078b75ea2454acadf864cc88
3
+ size 15018765
DSGDm-8-complete/3396810/tb_trace/worker_02.1736803679700306866.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0f1778fc24a2e2101452d9bec39462dbd3ece9206cfd28c75e2b6f29ae7647
3
+ size 15068964
DSGDm-8-complete/3396810/tb_trace/worker_03.1736803679701051019.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb16e2208b3dc4473ecc9ae2820092a6be5ae09ec98c603fb4552b2f4be301f
3
+ size 14998443
DSGDm-8-complete/3396810/tb_trace/worker_04.1736803679699263562.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28a2aacea07d17081b43b5d9672959b9ad1592ad99a73ace4b2faa3b098c905e
3
+ size 15038253
DSGDm-8-complete/3396810/tb_trace/worker_05.1736803679694488849.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d25b7ed8450330e76ce6f7fc8c4724de57cd25cc1e39c98840bacd1ad13d5e
3
+ size 15159249
DSGDm-8-complete/3396810/tb_trace/worker_06.1736803679698583528.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f9897774b01c14a46f5c95e794793d26ac33c77737753f882c5a6d1474110cb
3
+ size 15033494
DSGDm-8-complete/3396810/tb_trace/worker_07.1736803679698570448.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:449570c66b5d288c3e8742b63837abdbd235376466ccb335d9478c6153f8ed85
3
+ size 15086969
DSGDm-8-complete/3396810/test_results.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ epoch,val_loss,val_acc1,val_acc5,val_samples
2
+ 45.0,2.2337640564727783,68.86999997314453,89.11200002441406,50000.0
3
+ 90.0,1.9381773719406128,77.48400000976562,93.65199993896485,50000.0
DSGDm-8-complete/3396810/train_cfg.dump.toml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size = 1024
2
+ max_epochs = 90
3
+ lr = 1.0
4
+ label_smoothing = 0.1
5
+ grad_clip_norm = 0.0
6
+ checkpoint_dir = ""
7
+ arch = "resnet50"
8
+ use_amp = true
9
+ num_samples_for_stats = 102400
10
+ batch_size_per_local_batch = 128
11
+
12
+ [backend]
13
+ name = "decent-dp"
14
+ topology = "complete"
15
+
16
+ [preprocess]
17
+ preload_local = true
18
+ interpolation = "bilinear"
19
+ train_crop_size = 176
20
+ val_image_size = 256
21
+ val_crop_size = 224
22
+
23
+ [optim]
24
+ name = "sgd"
25
+ weight_decay = 3.0517578125e-05
26
+ momentum = 0.875
27
+
28
+ [lr_scheduler]
29
+ name = "cosine"
30
+ warmup_epochs = 5
31
+ warmup_decay = 0.01
32
+ eta_min = 1e-05
33
+
34
+ [reproduce]
35
+ seed = 810976
36
+
37
+ [log]
38
+ log_freq = 100
39
+ wandb_on = true
40
+ wandb_project = "decent-sam"
41
+ checkpoint_freq = 45
42
+ job_id = "3396810"
43
+ log_dir = "/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396810"
44
+
45
+ [network]
46
+ world_size = 8
47
+ rank = 0
48
+ local_rank = 0
49
+ local_world_size = 4
50
+ node_list = "alvis7-[09-10]"
DSGDm-8-complete/3396810/train_cfg.toml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size = 1024
2
+ max_epochs = 90
3
+ lr = 1.0
4
+ label_smoothing = 0.1
5
+ arch = "resnet50"
6
+ use_amp = true
7
+
8
+ [backend]
9
+ name = 'decent-dp'
10
+ topology = 'complete'
11
+
12
+ [preprocess]
13
+ preload_local = true
14
+ interpolation = "bilinear"
15
+ train_crop_size = 176
16
+ val_image_size = 256
17
+ val_crop_size = 224
18
+
19
+ [optim]
20
+ name = 'sgd'
21
+ momentum = 0.875
22
+ weight_decay = 0.000030517578125
23
+
24
+ [lr_scheduler]
25
+ name = 'cosine'
26
+ warmup_epochs = 5
27
+ warmup_decay = 0.01
28
+
29
+ [reproduce]
30
+ seed = 810976
31
+
32
+ [log]
33
+ log_freq = 100
34
+ wandb_on = true
35
+ wandb_project = "decent-sam"
36
+ checkpoint_freq = 45
DSGDm-8-complete/3396810/train_log.csv ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch,step,train_loss,val_loss,val_acc1,val_acc5,time,checkpoint_dir
2
+ 1,1251,6.115497382234135,5.49630715549469,8.321999994430541,22.581999992523194,178.20069289207458,
3
+ 2,2502,4.902603116562422,4.38631671503067,23.0619999798584,46.49800000289917,272.038290977478,
4
+ 3,3753,4.195299116017626,3.9677681756305696,33.50599998046875,59.47599997314453,364.5414605140686,
5
+ 4,5004,3.8045895575857656,3.4839741384124756,40.48199998336792,66.93999997497559,456.93470644950867,
6
+ 5,6255,3.587492471595081,3.3035725147724153,45.679999972076416,71.42799997711181,549.1559157371521,
7
+ 6,7506,3.4111864673052663,3.252914929189682,46.39999997375488,72.17200000518798,641.4471440315247,
8
+ 7,8757,3.2772125174387465,3.0007691029167174,51.16199997344971,76.56999998840332,733.5894980430603,
9
+ 8,10008,3.1858509929536534,2.944188568649292,52.62199997467041,77.4099999874878,825.8874089717865,
10
+ 9,11259,3.114799227526815,3.0860951818275453,49.53599997512817,74.72599998565674,918.1725902557373,
11
+ 10,12510,3.0601791883353515,2.801905057754517,55.79999997833252,80.17799998168945,1010.4640154838562,
12
+ 11,13761,3.0145340088031274,2.8203145016384124,55.521999955444336,80.04599997375489,1110.194946527481,
13
+ 12,15012,2.9810065702854587,2.818964365930557,55.29199998260498,79.78199998138427,1202.1528532505035,
14
+ 13,16263,2.9469908509466,2.7131245301103593,57.641999983825684,81.55200000152588,1294.0937585830688,
15
+ 14,17514,2.9168076822750097,2.696461324515343,58.14199997253418,81.8919999798584,1393.63254904747,
16
+ 15,18765,2.8954941391659013,2.7241055217075347,57.52800000976563,81.48799998718262,1485.6434042453766,
17
+ 16,20016,2.872533181111971,2.6555525961971282,59.35599998443603,82.98799996643066,1577.6670188903809,
18
+ 17,21267,2.8503478197790355,2.698758298025131,58.60799997528076,82.30599999786376,1677.0858178138733,
19
+ 18,22518,2.832455455828056,2.673398114233017,58.705999983215335,82.3939999822998,1768.8874039649963,
20
+ 19,23769,2.816942418341061,2.596034429960251,60.95199998413086,84.24199998535157,1860.7363867759705,
21
+ 20,25020,2.7986710402938866,2.5837205569553374,61.309999973754884,84.33599998016358,1952.6566116809845,
22
+ 21,26271,2.7846179530441426,2.7024088632774355,58.051999953308105,81.90399998657226,2044.8731739521027,
23
+ 22,27522,2.769252215715335,2.611931260843277,60.715999960327146,83.91599997741699,2137.1224229335785,
24
+ 23,28773,2.7585869756200427,2.5537972654533387,62.10599998229981,84.76799998840332,2229.266894340515,
25
+ 24,30024,2.7435707499940905,2.557687119922638,61.66799997253418,84.59399999664306,2321.205994606018,
26
+ 25,31275,2.7306130466272505,2.6683655838346483,59.56999998657226,82.65999997741699,2413.2169272899628,
27
+ 26,32526,2.71709777485076,2.524520060300827,62.71799997314453,85.43799997253419,2505.2413029670715,
28
+ 27,33777,2.7053595260321663,2.5170880539512632,62.23199997543335,84.88599996887207,2604.7841408252716,
29
+ 28,35028,2.6954730913984024,2.5253023421144487,61.97999997650147,84.99799997222901,2696.6729650497437,
30
+ 29,36279,2.6833848688575768,2.5719702689266204,61.5839999887085,84.5299999935913,2788.51065325737,
31
+ 30,37530,2.670573327038214,2.5124143429517747,63.17399995239258,85.64599999938964,2887.899392604828,
32
+ 31,38781,2.6598851650977116,2.511490090098381,63.63800001159668,85.55999996459961,2979.737328529358,
33
+ 32,40032,2.646365909434432,2.539848774638176,62.865999978942874,85.07999997406006,3071.771167039871,
34
+ 33,41283,2.63584532190284,2.4440039260196684,64.55799997772216,86.51999998657226,3171.4136126041412,
35
+ 34,42534,2.6246873265166553,2.4617872117948534,64.07999997589111,86.01999999053955,3263.3756012916565,
36
+ 35,43785,2.6102599474928265,2.495519027967453,64.0659999935913,86.00200000488282,3355.366861104965,
37
+ 36,45036,2.600696023348138,2.4298770931959153,64.75199996795654,86.64999998535156,3447.217997074127,
38
+ 37,46287,2.5888351397358065,2.4921805786609648,63.15599997375488,85.52399997772217,3539.190637111664,
39
+ 38,47538,2.5782451668708064,2.465635354757309,64.35399996612549,86.52400001312256,3631.2022793293,
40
+ 39,48789,2.566590094547287,2.3938324451351165,65.39399996826172,86.99999999664307,3723.1156182289124,
41
+ 40,50040,2.5534790206632074,2.3881699456834795,65.55799998321534,86.99199998596191,3815.030925989151,
42
+ 41,51291,2.540958863058441,2.377643189086914,66.41799998596191,87.78799998138427,3906.896488904953,
43
+ 42,52542,2.52921495210829,2.370751012825966,66.04199999267578,87.65399995941162,3998.8248476982117,
44
+ 43,53793,2.5176791096333977,2.447054692735672,65.20199995361328,86.69199997528077,4098.377296209335,
45
+ 44,55044,2.5004122752985127,2.3770597540855407,66.16999997436524,87.63999998382569,4190.288501262665,
46
+ 45,56295,2.4915684670996034,2.3904737115240096,65.55999998291016,86.99599998870849,4282.123118877411,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396810/checkpoints/model_45.pt
47
+ 46,57546,2.4768516317927105,2.370375301232338,66.6199999911499,87.54799998901368,4381.528468608856,
48
+ 47,58797,2.462672047906642,2.3097496503448487,67.64400000366211,88.34999995605469,4473.578360557556,
49
+ 48,60048,2.450321657134951,2.291786767001152,68.1439999621582,88.5859999545288,4565.646231889725,
50
+ 49,61299,2.4367442497913596,2.414863888645172,65.57199995544434,87.01399996490478,4665.047325372696,
51
+ 50,62550,2.421720023456809,2.3640809498119353,66.9279999847412,87.52200000518799,4756.795320510864,
52
+ 51,63801,2.409095025081619,2.337284747476578,67.53199996368409,88.26999998565674,4848.593809604645,
53
+ 52,65052,2.39349576933302,2.3086278829574587,67.99199998260498,88.29399997283936,4940.544646501541,
54
+ 53,66303,2.379139742929396,2.28750909260273,68.62399998016357,88.74199997589112,5032.606946706772,
55
+ 54,67554,2.3633363546727657,2.331979500384331,67.50999998596191,88.1219999597168,5124.647661447525,
56
+ 55,68805,2.3463351102136403,2.2553140204572677,69.30599996551514,89.40599997070312,5216.527634859085,
57
+ 56,70056,2.331912900642049,2.2343211260557174,69.49999996795654,89.48799995758057,5308.56108379364,
58
+ 57,71307,2.3144515739928044,2.233425653114319,69.98399998718261,89.68799997314453,5400.826423168182,
59
+ 58,72558,2.298989494939407,2.267748852825165,69.11199995941162,89.16999995208741,5493.116752147675,
60
+ 59,73809,2.2821754796041858,2.321332974061966,67.68799996368408,88.15399998382568,5592.708019256592,
61
+ 60,75060,2.266402605745337,2.233510379371643,70.10800000549317,89.59599996002197,5684.661290645599,
62
+ 61,76311,2.248882063477636,2.269366139588356,69.6219999621582,89.49599996795655,5776.508955955505,
63
+ 62,77562,2.2304583841638506,2.1631349580574035,71.61999999816895,90.65399994934081,5875.927084207535,
64
+ 63,78813,2.210439218022078,2.1846533494901657,71.30799998931884,90.24399996520997,5967.6671352386475,
65
+ 64,80064,2.1924270626476154,2.20357965007782,70.41599999725342,89.83599999725342,6059.495954990387,
66
+ 65,81315,2.1723172299796154,2.14432814950943,72.68599997406005,90.9159999710083,6158.839286565781,
67
+ 66,82566,2.1546609271034822,2.136791229815483,72.47399998474121,91.01799998413085,6250.693339586258,
68
+ 67,83817,2.1353299207657837,2.1841600111722945,71.4879999734497,90.48399996826171,6342.555477380753,
69
+ 68,85068,2.113898727581274,2.1171528541707993,73.22599997680663,91.27599993896484,6434.511804103851,
70
+ 69,86319,2.0941642940663794,2.08480306992054,73.80799997406005,91.69799995788574,6526.674050331116,
71
+ 70,87570,2.0748744738473595,2.1041335141468047,73.37799997406006,91.47399996032715,6618.951231479645,
72
+ 71,88821,2.05246179085746,2.097317939052582,73.63399998199463,91.635999944458,6711.154579639435,
73
+ 72,90072,2.0299681932162894,2.114106169629097,73.26199998168946,91.47999996612549,6803.397469758987,
74
+ 73,91323,2.0102840042371546,2.05411882917881,74.55999997680664,92.17199994445801,6895.66441822052,
75
+ 74,92574,1.988244648460004,2.0506596167564393,74.5999999899292,92.165999949646,6987.938447475433,
76
+ 75,93825,1.9637918177959348,2.046655902919769,74.71800000335693,92.22999994140625,7087.6016981601715,
77
+ 76,95076,1.946619587610189,2.016001330356598,75.63199997650146,92.66799994415283,7179.433463811874,
78
+ 77,96327,1.9239142328191052,1.9993775571107864,75.95200000396729,92.97799993591309,7271.2513654232025,
79
+ 78,97578,1.9024203764639502,1.994896138973236,76.15399998260499,93.06599995788574,7370.6608374118805,
80
+ 79,98829,1.8808876718048284,1.9910964307689667,76.37600000946045,93.05599993896485,7462.502587795258,
81
+ 80,100080,1.8634167736430438,1.984393134250641,76.21399999053955,93.07199994445801,7554.465767860413,
82
+ 81,101331,1.8463708090124655,1.9833151563167573,76.65400000427246,93.21199995513916,7654.044641971588,
83
+ 82,102582,1.8278751979819494,1.9754268894052505,76.92800000457764,93.27399996063232,7745.87461400032,
84
+ 83,103833,1.8130075675334862,1.9714242833566666,77.15600000457763,93.387999944458,7837.780626296997,
85
+ 84,105084,1.7992366814761043,1.9707070908737183,77.13599999053955,93.515999944458,7929.781843185425,
86
+ 85,106335,1.7906732545863333,1.9659925394535065,77.33200000976562,93.46599993621827,8021.87069940567,
87
+ 86,107586,1.7811057284462462,1.9619173885440826,77.39200000152587,93.54999993896485,8113.900476932526,
88
+ 87,108837,1.775649699673569,1.9636126394271851,77.32399999084473,93.56599993896485,8205.859763145447,
89
+ 88,110088,1.7688615699585297,1.9585561914587022,77.37600000976562,93.6239999472046,8297.732298135757,
90
+ 89,111339,1.766685530031137,1.9597701799440383,77.41200000701905,93.60799993896484,8389.784857749939,
91
+ 90,112590,1.7652602999521008,1.9599693808031082,77.52000001251221,93.64399993896484,8481.97145485878,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396810/checkpoints/model_90.pt
DSGDm-8-complete/3396814/checkpoints/model_45.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbd5a664a9b970d7f93d79e7f0514c25952b4c71f4f0a88f55569c73664edfa9
3
+ size 102518166
DSGDm-8-complete/3396814/checkpoints/model_90.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f15cd9230781645d90afb246083c4ca22ce6b20cf9bebfb726ecd698c97acd60
3
+ size 102518166
DSGDm-8-complete/3396814/data_cfg.dump.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_dir = "./data/Imagenet"
2
+ num_classes = 1000
3
+
4
+ [dataloader]
5
+ name = "ffcv"
6
+ processed_data_dir = "./data/ffcv"
7
+ max_resolution = 500
8
+ compress_probability = 1.0
9
+ jpeg_quality = 90
10
+ num_data_workers = 12
11
+ in_memory = true
12
+ tag = "ffcv_500_1.000_90"
13
+ train_data_dir = "./data/ffcv/ffcv_500_1.000_90_train.ffcv"
14
+ val_data_dir = "./data/ffcv/ffcv_500_1.000_90_val.ffcv"
DSGDm-8-complete/3396814/data_cfg.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_dir = "./data/Imagenet"
2
+ num_classes = 1000
3
+
4
+ [dataloader]
5
+ name = "ffcv"
6
+ processed_data_dir = "./data/ffcv"
7
+ max_resolution = 500
8
+ compress_probability = 1.0
9
+ jpeg_quality = 90
10
+ num_data_workers = 12
11
+
12
+ # [dataloader]
13
+ # name = "dali"
14
+ # preload = true
15
+ # sharded_data_dir = "./data/Imagenet-sharded"
16
+ # num_data_workers = 8
DSGDm-8-complete/3396814/err.out ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793]
2
+ W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793] *****************************************
3
+ W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ W0113 22:27:34.921000 4515 site-packages/torch/distributed/run.py:793] *****************************************
5
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
6
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
7
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
8
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
9
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
10
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] run_id : 26983
11
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
12
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.163:28052
13
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
14
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
15
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
16
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396814/torchelastic_xo1s6d9h
17
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
18
+ I0113 22:27:34.922000 4515 site-packages/torch/distributed/launcher/api.py:194]
19
+ W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793]
20
+ W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793] *****************************************
21
+ W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
22
+ W0113 22:27:35.269000 4639 site-packages/torch/distributed/run.py:793] *****************************************
23
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
24
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
25
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
26
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
27
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
28
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] run_id : 26983
29
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
30
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.163:28052
31
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
32
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
33
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
34
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396814/torchelastic__h0j47iu
35
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
36
+ I0113 22:27:35.270000 4639 site-packages/torch/distributed/launcher/api.py:194]
37
+ I0113 22:27:35.282000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
38
+ I0113 22:27:35.282000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
39
+ I0113 22:27:35.389000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
40
+ I0113 22:27:35.389000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
41
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
42
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
43
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.163
44
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
45
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=0
46
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
47
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
48
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3]
49
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3]
50
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
51
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
52
+ I0113 22:27:36.408000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:525]
53
+ I0113 22:27:36.409000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
54
+ I0113 22:27:36.409000 4639 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
55
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
56
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
57
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.163
58
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
59
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=1
60
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
61
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
62
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[4, 5, 6, 7]
63
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[4, 5, 6, 7]
64
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
65
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
66
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:525]
67
+ I0113 22:27:36.410000 4639 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
68
+ I0113 22:27:36.409000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
69
+ I0113 22:27:36.410000 4639 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
70
+ I0113 22:27:36.410000 4515 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
71
+ I0113 22:27:36.410000 4515 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
72
+ I0113 22:27:36.410000 4515 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
73
+ [rank0]:[W113 22:27:51.533565081 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
74
+ [rank1]:[W113 22:27:51.533956704 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
75
+ [rank3]:[W113 22:27:51.535096922 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
76
+ [rank2]:[W113 22:27:51.539950908 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
77
+ [rank5]:[W113 22:27:51.792502773 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
78
+ [rank6]:[W113 22:27:51.792731829 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
79
+ [rank4]:[W113 22:27:51.794218912 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
80
+ [rank7]:[W113 22:27:51.794605275 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
81
+ wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
82
+ wandb: Currently logged in as: zesen. Use `wandb login --relogin` to force relogin
83
+ wandb: Tracking run with wandb version 0.19.1
84
+ wandb: Run data is saved locally in /local/tmp.3396814/wandb/run-20250113_222758-wdn6kgy3
85
+ wandb: Run `wandb offline` to turn off syncing.
86
+ wandb: Syncing run 3396814
87
+ wandb: ⭐️ View project at https://wandb.ai/zesen/decent-sam
88
+ wandb: πŸš€ View run at https://wandb.ai/zesen/decent-sam/runs/wdn6kgy3
89
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
90
+ warnings.warn(
91
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
92
+ warnings.warn(
93
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
94
+ warnings.warn(
95
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
96
+ warnings.warn(
97
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
98
+ warnings.warn(
99
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
100
+ warnings.warn(
101
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
102
+ warnings.warn(
103
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
104
+ warnings.warn(
105
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
106
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
107
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
108
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
109
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
110
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
111
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
112
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
113
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
114
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
115
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
116
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
117
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
118
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
119
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
120
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
121
+ wandb:
122
+ wandb:
123
+ wandb: Run history:
124
+ wandb: epoch β–β–β–β–β–β–‚β–‚β–‚β–‚β–ƒβ–ƒβ–ƒβ–„β–„β–„β–„β–„β–„β–„β–…β–…β–…β–…β–…β–…β–†β–†β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆβ–ˆβ–ˆ
125
+ wandb: epoch_train_time β–ˆβ–β–β–β–β–‚β–‚β–‚β–β–β–‚β–‚β–β–β–β–β–β–β–‚β–‚β–β–β–β–β–β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–
126
+ wandb: loss β–ˆβ–‡β–†β–†β–†β–„β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–β–β–β–
127
+ wandb: lr β–‚β–„β–„β–…β–…β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–‡β–†β–†β–†β–†β–…β–…β–…β–„β–„β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–
128
+ wandb: total_train_time β–β–β–β–β–β–‚β–‚β–‚β–‚β–‚β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–„β–„β–„β–„β–„β–…β–…β–…β–…β–†β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆβ–ˆ
129
+ wandb: val_acc1 β–β–ƒβ–…β–…β–…β–…β–…β–…β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
130
+ wandb: val_acc5 β–β–ƒβ–…β–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
131
+ wandb: val_loss β–ˆβ–…β–„β–ƒβ–ƒβ–ƒβ–‚β–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–
132
+ wandb:
133
+ wandb: Run summary:
134
+ wandb: epoch 90
135
+ wandb: epoch_train_time 92.50205
136
+ wandb: loss 1.77242
137
+ wandb: lr 1e-05
138
+ wandb: total_train_time 8475.51776
139
+ wandb: val_acc1 77.444
140
+ wandb: val_acc5 93.642
141
+ wandb: val_loss 1.95482
142
+ wandb:
143
+ [rank1]:[W114 00:53:43.628969837 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
144
+ [rank2]:[W114 00:53:43.646098834 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
145
+ [rank4]:[W114 00:53:43.647502784 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
146
+ [rank7]:[W114 00:53:43.652868377 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
147
+ [rank5]:[W114 00:53:43.654237363 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
148
+ [rank6]:[W114 00:53:43.680114338 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
149
+ wandb: πŸš€ View run 3396814 at: https://wandb.ai/zesen/decent-sam/runs/wdn6kgy3
150
+ wandb: ⭐️ View project at: https://wandb.ai/zesen/decent-sam
151
+ wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
152
+ wandb: Find logs at: /local/tmp.3396814/wandb/run-20250113_222758-wdn6kgy3/logs
153
+ [rank3]:[W114 00:53:43.809175777 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
154
+ [rank0]:[W114 00:53:46.656309667 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
155
+ I0114 00:53:53.886000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
156
+ I0114 00:53:53.889000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
157
+ I0114 00:53:59.386000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
158
+ I0114 00:53:59.390000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
159
+ I0114 00:53:59.392000 4515 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0006356239318847656 seconds
160
+ I0114 00:53:59.392000 4639 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 5.502547264099121 seconds
161
+ [W114 00:53:59.606721821 TCPStore.cpp:131] [c10d] recvVector failed on SocketImpl(fd=4, addr=[alvis7-12.int.private]:59548, remote=[alvis7-11.int.private]:28052): failed to recv, got 0 bytes
162
+ Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:670 (most recent call first):
163
+ frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x14f24b8ad446 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libc10.so)
164
+ frame #1: <unknown function> + 0x5fec818 (0x14f2868ef818 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
165
+ frame #2: <unknown function> + 0x5fece49 (0x14f2868efe49 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
166
+ frame #3: <unknown function> + 0x5fefd67 (0x14f2868f2d67 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
167
+ frame #4: c10d::TCPStore::compareSet(std::string const&, std::vector<unsigned char, std::allocator<unsigned char> > const&, std::vector<unsigned char, std::allocator<unsigned char> > const&) + 0x254 (0x14f2868ec5e4 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
168
+ frame #5: <unknown function> + 0xd79664 (0x14f2962e1664 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
169
+ frame #6: <unknown function> + 0x4cc1e3 (0x14f295a341e3 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
170
+ frame #7: <unknown function> + 0x224588 (0x56096c5b9588 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
171
+ frame #8: _PyObject_MakeTpCall + 0x2bb (0x56096c59975b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
172
+ frame #9: <unknown function> + 0x251777 (0x56096c5e6777 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
173
+ frame #10: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
174
+ frame #11: _PyObject_FastCallDictTstate + 0x1ee (0x56096c59c2fe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
175
+ frame #12: _PyObject_Call_Prepend + 0xe9 (0x56096c5c7739 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
176
+ frame #13: <unknown function> + 0x30364b (0x56096c69864b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
177
+ frame #14: _PyObject_Call + 0xb5 (0x56096c5ca135 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
178
+ frame #15: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
179
+ frame #16: PyEval_EvalCode + 0xa1 (0x56096c64f741 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
180
+ frame #17: <unknown function> + 0x2def1a (0x56096c673f1a in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
181
+ frame #18: <unknown function> + 0x2d9d35 (0x56096c66ed35 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
182
+ frame #19: <unknown function> + 0x2f2780 (0x56096c687780 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
183
+ frame #20: _PyRun_SimpleFileObject + 0x1ce (0x56096c686dfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
184
+ frame #21: _PyRun_AnyFileObject + 0x44 (0x56096c686ac4 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
185
+ frame #22: Py_RunMain + 0x2fe (0x56096c67fdfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
186
+ frame #23: Py_BytesMain + 0x37 (0x56096c63a0c7 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
187
+ frame #24: __libc_start_main + 0xe5 (0x14f297714d85 in /lib64/libc.so.6)
188
+ frame #25: <unknown function> + 0x2a4f71 (0x56096c639f71 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
189
+
190
+ W0114 00:53:59.426000 4515 site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1282] The node 'alvis7-12.int.private_4515_0' has failed to shutdown the rendezvous '26983' due to an error of type RendezvousConnectionError.
191
+ [W114 00:53:59.621070642 TCPStore.cpp:122] [c10d] sendBytes failed on SocketImpl(fd=4, addr=[alvis7-12.int.private]:59548, remote=[alvis7-11.int.private]:28052): Broken pipe
192
+ Exception raised from sendBytes at ../torch/csrc/distributed/c10d/Utils.hpp:645 (most recent call first):
193
+ frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x14f24b8ad446 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libc10.so)
194
+ frame #1: <unknown function> + 0x5fecb29 (0x14f2868efb29 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
195
+ frame #2: c10d::TCPStore::compareSet(std::string const&, std::vector<unsigned char, std::allocator<unsigned char> > const&, std::vector<unsigned char, std::allocator<unsigned char> > const&) + 0x22d (0x14f2868ec5bd in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
196
+ frame #3: <unknown function> + 0xd79664 (0x14f2962e1664 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
197
+ frame #4: <unknown function> + 0x4cc1e3 (0x14f295a341e3 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
198
+ frame #5: <unknown function> + 0x224588 (0x56096c5b9588 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
199
+ frame #6: _PyObject_MakeTpCall + 0x2bb (0x56096c59975b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
200
+ frame #7: <unknown function> + 0x251777 (0x56096c5e6777 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
201
+ frame #8: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
202
+ frame #9: _PyObject_FastCallDictTstate + 0x1ee (0x56096c59c2fe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
203
+ frame #10: _PyObject_Call_Prepend + 0xe9 (0x56096c5c7739 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
204
+ frame #11: <unknown function> + 0x30364b (0x56096c69864b in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
205
+ frame #12: _PyObject_Call + 0xb5 (0x56096c5ca135 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
206
+ frame #13: <unknown function> + 0x113339 (0x56096c4a8339 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
207
+ frame #14: PyEval_EvalCode + 0xa1 (0x56096c64f741 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
208
+ frame #15: <unknown function> + 0x2def1a (0x56096c673f1a in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
209
+ frame #16: <unknown function> + 0x2d9d35 (0x56096c66ed35 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
210
+ frame #17: <unknown function> + 0x2f2780 (0x56096c687780 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
211
+ frame #18: _PyRun_SimpleFileObject + 0x1ce (0x56096c686dfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
212
+ frame #19: _PyRun_AnyFileObject + 0x44 (0x56096c686ac4 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
213
+ frame #20: Py_RunMain + 0x2fe (0x56096c67fdfe in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
214
+ frame #21: Py_BytesMain + 0x37 (0x56096c63a0c7 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
215
+ frame #22: __libc_start_main + 0xe5 (0x14f297714d85 in /lib64/libc.so.6)
216
+ frame #23: <unknown function> + 0x2a4f71 (0x56096c639f71 in /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/bin/python3.12)
217
+
218
+ W0114 00:53:59.431000 4515 site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1282] The node 'alvis7-12.int.private_4515_0' has failed to shutdown the rendezvous '26983' due to an error of type RendezvousConnectionError.
DSGDm-8-complete/3396814/log.out ADDED
The diff for this file is too large to render. See raw diff
 
DSGDm-8-complete/3396814/tb_trace/worker_00.1736803741716515488.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d867680757a0af1da55d1bb1b26e71571e5f33e51061116c8138855ae57b78b9
3
+ size 15061671
DSGDm-8-complete/3396814/tb_trace/worker_01.1736803741715256737.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcb81cff5272a083ce11fb37bb154dbc20a8dd7e01f87d781c88ec68a4f3ca9a
3
+ size 15053264
DSGDm-8-complete/3396814/tb_trace/worker_02.1736803741714535034.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edf3ac376a3ee5e827c0066060ab79c5abd04700ab961ebb8103fdfb29c4a7a9
3
+ size 15106134
DSGDm-8-complete/3396814/tb_trace/worker_03.1736803741708035584.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5245069681853ea95ecbc6b0a55fbebe085f198e78ea06bb0f5d0cf8ae2d9312
3
+ size 15080397
DSGDm-8-complete/3396814/tb_trace/worker_04.1736803741713051100.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ec09c2cf4722fccf10c329c939c497480b3e46fc49bc6333cf55ed210bd7798
3
+ size 14997367
DSGDm-8-complete/3396814/tb_trace/worker_05.1736803741713040608.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83d7ba8c90eb93d926e058b361113252625044d61a01e9274838a98cd9bea799
3
+ size 15104925
DSGDm-8-complete/3396814/tb_trace/worker_06.1736803741713039723.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bed434ffdebf76928d8754ede222dd163baf3dd0a0a8e5960b3efc1681ee1f3
3
+ size 15118939
DSGDm-8-complete/3396814/tb_trace/worker_07.1736803741713054925.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e56f85f81b7fa09356f71da49848ae5214750307ec491539e80206aad2bd9dfd
3
+ size 15064239
DSGDm-8-complete/3396814/test_results.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ epoch,val_loss,val_acc1,val_acc5,val_samples
2
+ 45.0,2.243170656890869,68.94799997314453,89.25399997314453,50000.0
3
+ 90.0,1.9342285956573486,77.42600001464844,93.64999999023438,50000.0
DSGDm-8-complete/3396814/train_cfg.dump.toml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size = 1024
2
+ max_epochs = 90
3
+ lr = 1.0
4
+ label_smoothing = 0.1
5
+ grad_clip_norm = 0.0
6
+ checkpoint_dir = ""
7
+ arch = "resnet50"
8
+ use_amp = true
9
+ num_samples_for_stats = 102400
10
+ batch_size_per_local_batch = 128
11
+
12
+ [backend]
13
+ name = "decent-dp"
14
+ topology = "complete"
15
+
16
+ [preprocess]
17
+ preload_local = true
18
+ interpolation = "bilinear"
19
+ train_crop_size = 176
20
+ val_image_size = 256
21
+ val_crop_size = 224
22
+
23
+ [optim]
24
+ name = "sgd"
25
+ weight_decay = 3.0517578125e-05
26
+ momentum = 0.875
27
+
28
+ [lr_scheduler]
29
+ name = "cosine"
30
+ warmup_epochs = 5
31
+ warmup_decay = 0.01
32
+ eta_min = 1e-05
33
+
34
+ [reproduce]
35
+ seed = 810975
36
+
37
+ [log]
38
+ log_freq = 100
39
+ wandb_on = true
40
+ wandb_project = "decent-sam"
41
+ checkpoint_freq = 45
42
+ job_id = "3396814"
43
+ log_dir = "/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396814"
44
+
45
+ [network]
46
+ world_size = 8
47
+ rank = 0
48
+ local_rank = 0
49
+ local_world_size = 4
50
+ node_list = "alvis7-[11-12]"
DSGDm-8-complete/3396814/train_cfg.toml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size = 1024
2
+ max_epochs = 90
3
+ lr = 1.0
4
+ label_smoothing = 0.1
5
+ arch = "resnet50"
6
+ use_amp = true
7
+
8
+ [backend]
9
+ name = 'decent-dp'
10
+ topology = 'complete'
11
+
12
+ [preprocess]
13
+ preload_local = true
14
+ interpolation = "bilinear"
15
+ train_crop_size = 176
16
+ val_image_size = 256
17
+ val_crop_size = 224
18
+
19
+ [optim]
20
+ name = 'sgd'
21
+ momentum = 0.875
22
+ weight_decay = 0.000030517578125
23
+
24
+ [lr_scheduler]
25
+ name = 'cosine'
26
+ warmup_epochs = 5
27
+ warmup_decay = 0.01
28
+
29
+ [reproduce]
30
+ seed = 810975
31
+
32
+ [log]
33
+ log_freq = 100
34
+ wandb_on = true
35
+ wandb_project = "decent-sam"
36
+ checkpoint_freq = 45
DSGDm-8-complete/3396814/train_log.csv ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch,step,train_loss,val_loss,val_acc1,val_acc5,time,checkpoint_dir
2
+ 1,1251,6.127042294263268,5.393187141819,9.415999994888306,24.01799998046875,171.02069878578186,
3
+ 2,2502,4.919748115072624,4.310746669931412,23.879999983062746,48.737999978942874,264.15189838409424,
4
+ 3,3753,4.192763880550338,3.8052967199516297,33.935999976348874,60.11399998168945,356.6551320552826,
5
+ 4,5004,3.8018151840908256,3.5090049266910555,39.88599997238159,66.44000001068115,448.89388370513916,
6
+ 5,6255,3.5797099125899856,3.543133006410599,39.961999980163576,65.75999998565673,540.9955537319183,
7
+ 6,7506,3.4164077854223196,3.179875017986298,46.81599998474121,73.16600000091553,632.9472050666809,
8
+ 7,8757,3.2814336923434198,3.027001599264145,51.16999999053955,76.78600000061036,725.122312784195,
9
+ 8,10008,3.1878556112448373,2.947021433906555,52.11799996795654,77.601999977417,817.4653673171997,
10
+ 9,11259,3.1188443166365345,2.863088867096901,54.86399998687744,79.31199996948243,909.805721282959,
11
+ 10,12510,3.0681457159568746,2.843212861433029,55.32599997894287,79.95799997955322,1002.1738886833191,
12
+ 11,13761,3.0237617943403152,2.7861382486343382,55.98199997192383,80.5299999899292,1101.837260723114,
13
+ 12,15012,2.9884546110384185,2.850523549041748,55.24799998901367,79.8479999710083,1193.7801671028137,
14
+ 13,16263,2.9541580187235708,2.849518372297287,54.727999989013675,79.49399997619629,1285.6997277736664,
15
+ 14,17514,2.9270401369515273,2.7073931032371523,58.00999997772217,82.0199999609375,1385.2878336906433,
16
+ 15,18765,2.902910311587995,2.7079979345607756,57.44399997161865,81.63,1477.1498582363129,
17
+ 16,20016,2.8812622801720096,2.7070354832077026,58.13599995605469,81.85399996612549,1568.9361598491669,
18
+ 17,21267,2.858700812267933,2.676149680290222,59.13600000061035,82.65199996429443,1668.345825433731,
19
+ 18,22518,2.8411981567537947,2.698109716157913,58.33599996612549,82.16799997070312,1760.14688038826,
20
+ 19,23769,2.8249180473440845,2.7167605709171294,58.15199996765137,82.05399999267578,1852.1880807876587,
21
+ 20,25020,2.807371502704948,2.5683171302080154,61.2079999899292,84.14799996917725,1944.4754102230072,
22
+ 21,26271,2.793356894207992,2.7078482171201705,59.51799998657226,82.68399997192383,2036.822396993637,
23
+ 22,27522,2.777170517819105,2.637347710971832,60.355999951171874,83.30399999267578,2129.101192712784,
24
+ 23,28773,2.764402003930532,2.6262375885868074,60.78399997314453,83.60599996368408,2221.2840523719788,
25
+ 24,30024,2.7518614687317378,2.6395135861110686,60.178,83.71399999511719,2313.138547182083,
26
+ 25,31275,2.73986719895324,2.5539086744832993,62.165999970703126,84.8519999557495,2404.9164850711823,
27
+ 26,32526,2.727668600378753,2.5298820478487016,62.27799997131348,85.00399998077393,2496.6911010742188,
28
+ 27,33777,2.712991842572733,2.527021091747284,62.38000000396728,85.23199998077392,2596.0159952640533,
29
+ 28,35028,2.702992696341851,2.5217206066703794,62.265999991149904,84.96999999267578,2687.8102848529816,
30
+ 29,36279,2.690347477781782,2.5366484935188294,62.305999961853026,84.85999997558594,2779.9322304725647,
31
+ 30,37530,2.677466812989504,2.4979143647289277,63.065999974060055,85.58599998840332,2879.574129343033,
32
+ 31,38781,2.66630786688303,2.4680560279369352,63.2979999899292,85.91599999084472,2971.3960711956024,
33
+ 32,40032,2.6550243053076077,2.4834610175657272,64.3399999710083,86.23999998870849,3063.2164845466614,
34
+ 33,41283,2.6426750808406316,2.522367994060516,63.09799995697021,85.41600000946045,3162.8338882923126,
35
+ 34,42534,2.631698535429202,2.5459611785078047,62.13799999328613,84.60599998291016,3254.787368297577,
36
+ 35,43785,2.618261702531438,2.4807283532619477,63.8999999923706,85.79000000671387,3346.574568748474,
37
+ 36,45036,2.6088119867941937,2.4904993689632415,63.58199998687744,85.90799997528076,3438.6902253627777,
38
+ 37,46287,2.5972135505563827,2.5077360757446288,63.55999998168945,85.77999996917724,3530.874635219574,
39
+ 38,47538,2.585281231801668,2.4784041890478132,63.43799998626709,85.66799996368408,3623.1314704418182,
40
+ 39,48789,2.5742120447395136,2.5087062915229796,63.097999965515136,85.6260000012207,3715.4180703163147,
41
+ 40,50040,2.5596383893423136,2.365015663309097,66.00799998535156,87.39399998596191,3807.6838648319244,
42
+ 41,51291,2.5478728759012443,2.3786921441936495,65.76999997833252,87.38799997802734,3899.9565839767456,
43
+ 42,52542,2.5356153436273123,2.511224712333679,63.1599999609375,85.66799996673583,3992.22243142128,
44
+ 43,53793,2.5232722489096275,2.4498701629924775,64.17599996307374,86.299999972229,4091.8122646808624,
45
+ 44,55044,2.5083332883296823,2.3946868074226377,66.29799998901368,87.46799999145507,4183.590287208557,
46
+ 45,56295,2.4968910682067977,2.4412733810186387,64.58999997131347,86.41799998840332,4275.411991596222,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396814/checkpoints/model_45.pt
47
+ 46,57546,2.484610610990692,2.3499680482530594,67.20599997772217,88.08799995697021,4374.895538806915,
48
+ 47,58797,2.4715520979212724,2.4080028086328507,65.50199999725342,87.07999996765136,4466.660610198975,
49
+ 48,60048,2.458013077517398,2.3507400181007387,66.89799997680664,88.02199997772217,4558.6080548763275,
50
+ 49,61299,2.443213808820021,2.332980908880234,67.02399999145509,88.00199996185303,4658.193074226379,
51
+ 50,62550,2.428716633293173,2.2817089555072783,68.12199998565674,88.84199997833252,4750.039880990982,
52
+ 51,63801,2.4128139926899346,2.299946264133453,68.16999994476318,88.43199996490479,4841.821264743805,
53
+ 52,65052,2.4001326122634605,2.2698288248491285,68.96199997497558,89.1659999835205,4933.92874789238,
54
+ 53,66303,2.3864082657485652,2.3503749481487275,67.21599998535156,88.08799996429444,5025.790977239609,
55
+ 54,67554,2.3690470303658198,2.252627236652374,69.21599998657227,89.2879999786377,5117.707024812698,
56
+ 55,68805,2.352203613217119,2.2708754269838334,68.88399995697021,89.0299999761963,5209.992288351059,
57
+ 56,70056,2.336971477007599,2.2444003923130036,69.33399998413086,89.4659999786377,5302.660699605942,
58
+ 57,71307,2.322869612182454,2.252038153681755,69.87599996826172,89.66199996795655,5394.947804689407,
59
+ 58,72558,2.303248344684581,2.2128678690290453,70.1979999710083,89.9659999520874,5486.980106830597,
60
+ 59,73809,2.288970369586556,2.2328822772312162,70.2059999810791,89.95999995697021,5586.585511922836,
61
+ 60,75060,2.2723432064032574,2.2107101944732666,70.17399997802734,89.99799996246338,5678.411100149155,
62
+ 61,76311,2.2535721100777457,2.1873321256637572,71.1299999710083,90.43199996307374,5770.216231584549,
63
+ 62,77562,2.2356884876410548,2.194494543762207,70.99799998138428,90.36199998687744,5869.540560007095,
64
+ 63,78813,2.2184946567749235,2.1861774908304215,70.94199997589111,90.51399994659424,5961.295742034912,
65
+ 64,80064,2.2009496229777423,2.1907457696056367,71.15199998352051,90.33199999176026,6053.159170150757,
66
+ 65,81315,2.1804827656820236,2.2064537956762313,70.41599998168945,89.87799998687744,6152.669972419739,
67
+ 66,82566,2.1623485805891116,2.1926137644958494,71.59399997406005,90.46799996826172,6244.473973035812,
68
+ 67,83817,2.141807143946441,2.1369869295930863,72.65399998504638,91.18199997924805,6336.294122695923,
69
+ 68,85068,2.121135162935554,2.112554798183441,73.03400000091553,91.20199997955322,6428.506133556366,
70
+ 69,86319,2.1001662009244533,2.141602432551384,72.56600002227783,90.95999997650146,6520.811984539032,
71
+ 70,87570,2.0805942892885323,2.0747684485530855,73.6819999633789,91.84599995513916,6613.024798870087,
72
+ 71,88821,2.058419024343971,2.0777628285312653,73.84399996063233,91.9539999633789,6705.051984786987,
73
+ 72,90072,2.0374853375504056,2.076268817982674,74.12999997619629,91.99999996307373,6796.896181344986,
74
+ 73,91323,2.0162740735579834,2.091451440925598,73.89199997924804,91.98399995239258,6888.699318885803,
75
+ 74,92574,1.9920739202405051,2.0530714088726043,74.39799999786376,92.1759999685669,6980.464948892593,
76
+ 75,93825,1.9711303551920312,2.0350256051921845,74.98399995147705,92.43399996032714,7079.879634380341,
77
+ 76,95076,1.9504838722000877,2.0443485179948806,75.1579999874878,92.36399995239258,7171.66074180603,
78
+ 77,96327,1.929455490623542,2.011657994995117,75.62199999267578,92.82799995788574,7263.498735189438,
79
+ 78,97578,1.908993336508314,2.0061387955236434,76.07999998199463,92.93799995239257,7362.967435836792,
80
+ 79,98829,1.8885678461463236,2.0084419351196288,76.15599997406005,93.011999949646,7454.897849321365,
81
+ 80,100080,1.867487807222884,1.989534666991234,76.48199998199463,93.1919999472046,7546.6971027851105,
82
+ 81,101331,1.8502448157583782,1.97987747964859,76.56599999542236,93.25399994720459,7646.266963481903,
83
+ 82,102582,1.8327934436684699,1.9752510207366942,76.84799997680663,93.369999944458,7738.12802696228,
84
+ 83,103833,1.8193008570934084,1.9674947014093398,77.00599994995117,93.42199994720458,7830.075203418732,
85
+ 84,105084,1.8070869094652715,1.9616766112709045,77.21199997406006,93.51199994445801,7922.026429891586,
86
+ 85,106335,1.7945735428116019,1.9585169447088242,77.2159999822998,93.61799994171143,8014.047864198685,
87
+ 86,107586,1.7852201788164348,1.9618321807050705,77.25799998504638,93.58599994171142,8106.137816429138,
88
+ 87,108837,1.7775084370975014,1.9541121759557725,77.33799998474122,93.62399995269776,8198.392191886902,
89
+ 88,110088,1.773624649925007,1.9565625371456146,77.40199996063232,93.605999944458,8290.748585700989,
90
+ 89,111339,1.7710741437096105,1.9537402257823944,77.37199999572753,93.617999944458,8383.015711069107,
91
+ 90,112590,1.768616415947366,1.9548244094467162,77.44399997680664,93.64199994445801,8475.517761468887,/mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/work/decent-sam/log/3396814/checkpoints/model_90.pt
DSGDm-8-complete/3396815/checkpoints/model_45.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:245e03e1a3d6c8948c3e217637a60ae49f699aede3528820ccdddfa3065c8abd
3
+ size 102518166
DSGDm-8-complete/3396815/checkpoints/model_90.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43665e2f00ab308acaaaa4a2a19febf3ce07060d599bde51c78e27b8f88c7cd7
3
+ size 102518166
DSGDm-8-complete/3396815/data_cfg.dump.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_dir = "./data/Imagenet"
2
+ num_classes = 1000
3
+
4
+ [dataloader]
5
+ name = "ffcv"
6
+ processed_data_dir = "./data/ffcv"
7
+ max_resolution = 500
8
+ compress_probability = 1.0
9
+ jpeg_quality = 90
10
+ num_data_workers = 12
11
+ in_memory = true
12
+ tag = "ffcv_500_1.000_90"
13
+ train_data_dir = "./data/ffcv/ffcv_500_1.000_90_train.ffcv"
14
+ val_data_dir = "./data/ffcv/ffcv_500_1.000_90_val.ffcv"
DSGDm-8-complete/3396815/data_cfg.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_dir = "./data/Imagenet"
2
+ num_classes = 1000
3
+
4
+ [dataloader]
5
+ name = "ffcv"
6
+ processed_data_dir = "./data/ffcv"
7
+ max_resolution = 500
8
+ compress_probability = 1.0
9
+ jpeg_quality = 90
10
+ num_data_workers = 12
11
+
12
+ # [dataloader]
13
+ # name = "dali"
14
+ # preload = true
15
+ # sharded_data_dir = "./data/Imagenet-sharded"
16
+ # num_data_workers = 8
DSGDm-8-complete/3396815/err.out ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793]
2
+ W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793] *****************************************
3
+ W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ W0113 22:27:34.047000 4590 site-packages/torch/distributed/run.py:793] *****************************************
5
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
6
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
7
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
8
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
9
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
10
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] run_id : 28584
11
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
12
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.171:28052
13
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
14
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
15
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
16
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396815/torchelastic_ii5e9hc5
17
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
18
+ I0113 22:27:34.048000 4590 site-packages/torch/distributed/launcher/api.py:194]
19
+ W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793]
20
+ W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793] *****************************************
21
+ W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
22
+ W0113 22:27:34.107000 4547 site-packages/torch/distributed/run.py:793] *****************************************
23
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs:
24
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] entrypoint : src.train_decent
25
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] min_nodes : 2
26
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] max_nodes : 2
27
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] nproc_per_node : 4
28
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] run_id : 28584
29
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] rdzv_backend : c10d
30
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.21.30.171:28052
31
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900}
32
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] max_restarts : 0
33
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] monitor_interval : 0.1
34
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] log_dir : /local/tmp.3396815/torchelastic_lgc1b5s_
35
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194] metrics_cfg : {}
36
+ I0113 22:27:34.108000 4547 site-packages/torch/distributed/launcher/api.py:194]
37
+ I0113 22:27:34.122000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
38
+ I0113 22:27:34.122000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
39
+ I0113 22:27:34.545000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python3.12
40
+ I0113 22:27:34.546000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group
41
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
42
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
43
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.171
44
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
45
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=0
46
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
47
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
48
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3]
49
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3]
50
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
51
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
52
+ I0113 22:27:35.393000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:525]
53
+ I0113 22:27:35.394000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
54
+ I0113 22:27:35.394000 4547 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
55
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result:
56
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] restart_count=0
57
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_addr=10.21.30.171
58
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] master_port=28052
59
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_rank=1
60
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] group_world_size=2
61
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3]
62
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_ranks=[4, 5, 6, 7]
63
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_ranks=[4, 5, 6, 7]
64
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[8, 8, 8, 8]
65
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[8, 8, 8, 8]
66
+ I0113 22:27:35.394000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:525]
67
+ I0113 22:27:35.395000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group
68
+ I0113 22:27:35.395000 4547 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
69
+ I0113 22:27:35.395000 4547 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
70
+ I0113 22:27:35.395000 4590 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True
71
+ I0113 22:27:35.395000 4590 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer.
72
+ I0113 22:27:35.396000 4590 site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check.
73
+ [rank0]:[W113 22:27:49.888137996 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
74
+ [rank3]:[W113 22:27:49.888186841 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
75
+ [rank1]:[W113 22:27:49.888231426 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
76
+ [rank2]:[W113 22:27:49.888452742 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
77
+ [rank5]:[W113 22:27:50.531696251 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
78
+ [rank6]:[W113 22:27:50.532009107 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
79
+ [rank4]:[W113 22:27:50.532444240 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
80
+ [rank7]:[W113 22:27:50.532668073 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
81
+ wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
82
+ wandb: Currently logged in as: zesen. Use `wandb login --relogin` to force relogin
83
+ wandb: Tracking run with wandb version 0.19.1
84
+ wandb: Run data is saved locally in /local/tmp.3396815/wandb/run-20250113_222756-xtm2f8go
85
+ wandb: Run `wandb offline` to turn off syncing.
86
+ wandb: Syncing run 3396815
87
+ wandb: ⭐️ View project at https://wandb.ai/zesen/decent-sam
88
+ wandb: πŸš€ View run at https://wandb.ai/zesen/decent-sam/runs/xtm2f8go
89
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
90
+ warnings.warn(
91
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
92
+ warnings.warn(
93
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
94
+ warnings.warn(
95
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
96
+ warnings.warn(
97
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
98
+ warnings.warn(
99
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
100
+ warnings.warn(
101
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
102
+ warnings.warn(
103
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:224: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
104
+ warnings.warn(
105
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
106
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
107
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
108
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
109
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
110
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
111
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
112
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
113
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
114
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
115
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
116
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
117
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
118
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
119
+ /mimer/NOBACKUP/groups/naiss2024-23-12/users/zesenw/env/miniforge3/envs/resnet/lib/python3.12/site-packages/torch/optim/lr_scheduler.py:240: UserWarning: The epoch parameter in `scheduler.step()` was not necessary and is being deprecated where possible. Please use `scheduler.step()` to step the scheduler. During the deprecation, if epoch is different from None, the closed form is used instead of the new chainable form, where available. Please open an issue if you are unable to replicate your use case: https://github.com/pytorch/pytorch/issues/new/choose.
120
+ warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
121
+ wandb: uploading config.yaml; uploading output.log
122
+ wandb:
123
+ [rank1]:[W114 00:53:39.318429238 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
124
+ [rank6]:[W114 00:53:39.385046968 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
125
+ [rank7]:[W114 00:53:39.411510035 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
126
+ [rank5]:[W114 00:53:39.417334380 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
127
+ [rank2]:[W114 00:53:39.476722623 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
128
+ [rank3]:[W114 00:53:39.594174452 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
129
+ wandb:
130
+ wandb: Run history:
131
+ wandb: epoch β–β–β–β–‚β–‚β–‚β–‚β–‚β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–„β–„β–„β–…β–…β–…β–…β–…β–…β–…β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆ
132
+ wandb: epoch_train_time β–ˆβ–β–β–β–β–‚β–β–‚β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–‚β–β–‚β–β–β–β–β–β–‚β–β–‚β–β–β–β–β–β–β–
133
+ wandb: loss β–ˆβ–‡β–†β–†β–†β–†β–…β–…β–…β–…β–…β–…β–…β–…β–…β–„β–„β–„β–„β–„β–„β–„οΏ½οΏ½β–„β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–‚β–‚β–‚β–‚β–β–β–β–β–
134
+ wandb: lr β–β–„β–…β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‡β–‡β–‡β–‡β–‡β–†β–†β–†β–…β–…β–„β–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–β–β–
135
+ wandb: total_train_time β–β–β–β–β–‚β–‚β–‚β–‚β–‚β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–„β–„β–„β–„β–„β–„β–„β–…β–…β–…β–…β–…β–†β–†β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆ
136
+ wandb: val_acc1 β–β–‚β–…β–…β–…β–†β–†β–†β–†β–†β–†β–†β–†β–‡β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
137
+ wandb: val_acc5 β–β–‚β–…β–…β–†β–†β–†β–†β–†β–†β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–‡β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
138
+ wandb: val_loss β–ˆβ–…β–„β–ƒβ–ƒβ–ƒβ–ƒβ–ƒβ–‚β–ƒβ–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–β–β–β–β–β–β–β–β–β–β–β–
139
+ wandb:
140
+ wandb: Run summary:
141
+ wandb: epoch 90
142
+ wandb: epoch_train_time 92.11154
143
+ wandb: loss 1.76962
144
+ wandb: lr 1e-05
145
+ wandb: total_train_time 8472.76438
146
+ wandb: val_acc1 77.424
147
+ wandb: val_acc5 93.626
148
+ wandb: val_loss 1.95212
149
+ wandb:
150
+ wandb: πŸš€ View run 3396815 at: https://wandb.ai/zesen/decent-sam/runs/xtm2f8go
151
+ wandb: ⭐️ View project at: https://wandb.ai/zesen/decent-sam
152
+ wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
153
+ wandb: Find logs at: /local/tmp.3396815/wandb/run-20250113_222756-xtm2f8go/logs
154
+ [rank4]:[W114 00:53:40.296904513 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
155
+ [rank0]:[W114 00:53:42.625334345 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
156
+ I0114 00:53:49.018000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
157
+ I0114 00:53:49.020000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
158
+ I0114 00:53:50.667000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
159
+ I0114 00:53:50.671000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
160
+ I0114 00:53:50.673000 4547 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 1.6517422199249268 seconds
161
+ I0114 00:53:50.673000 4590 site-packages/torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0016102790832519531 seconds
DSGDm-8-complete/3396815/log.out ADDED
The diff for this file is too large to render. See raw diff
 
DSGDm-8-complete/3396815/tb_trace/worker_00.1736803740196062136.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:698fa85f677b0ee04ff6b110da0ff8dc437f180cd11345309c594d9b3015a2d6
3
+ size 15124021
DSGDm-8-complete/3396815/tb_trace/worker_01.1736803740195517240.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a816ca68f36cdbc4b5077d234efacd35f5896cb3536d72288ba964e0e87299f9
3
+ size 15018027
DSGDm-8-complete/3396815/tb_trace/worker_02.1736803740198629006.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97f5a14852e2cc062db678418494a2a262914e0e4daa513c426bd9307d1bd8f8
3
+ size 14978834
DSGDm-8-complete/3396815/tb_trace/worker_03.1736803740196019080.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef92f5bce4215d5eace0a69adc734a94b040839b29b5ca852e28c1a9551d70ef
3
+ size 15083314
DSGDm-8-complete/3396815/tb_trace/worker_04.1736803740196645296.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d40d4edc76fee681975349672c9fd9742c5fb5caa7d431d29ad64aae3e9b15e6
3
+ size 15064200
DSGDm-8-complete/3396815/tb_trace/worker_05.1736803740196646399.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84653bdf8ff5388deb57bc282a4a29eeb60f5f09317a2169641667ba68319fee
3
+ size 15027919
DSGDm-8-complete/3396815/tb_trace/worker_06.1736803740196720663.pt.trace.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57c12e1c1fbb6be2abfffeca0ae2444f8257cb686852226385d4d252efdf7242
3
+ size 15094896