diff --git "a/Order_Ablations/E115/train.rank0.log" "b/Order_Ablations/E115/train.rank0.log" new file mode 100644--- /dev/null +++ "b/Order_Ablations/E115/train.rank0.log" @@ -0,0 +1,1259 @@ +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +W0127 17:48:37.253000 200820 torch/distributed/run.py:803] +W0127 17:48:37.253000 200820 torch/distributed/run.py:803] ***************************************** +W0127 17:48:37.253000 200820 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0127 17:48:37.253000 200820 torch/distributed/run.py:803] ***************************************** +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. 
+Trainer._get_train_sampler replaced with custom implementation.
+[2026-01-27 17:48:44,068] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[the two lines above repeat 8 times in total, once per rank, interleaved; the ds_accelerator timestamps run 17:48:44,068 through 17:48:45,186]
+[2026-01-27 17:48:45,274] [INFO] [comm.py:658:init_distributed] cdb=None
[the cdb=None line repeats 8 times in total, timestamps 17:48:45,274 through 17:48:46,244]
+[2026-01-27 17:48:46,244] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
[the FlashAttention 3 warning repeats 8 times in total, once per rank]
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
[the Flash Attention 2.0 warning repeats 9 times in total]
+ Loading checkpoint shards: 0%| | 0/2 [00:00
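
Note on the repeated pynvml FutureWarning: it is raised by torch's own `import pynvml` in torch/cuda/__init__.py, not by user code. The deprecated `pynvml` PyPI package has been superseded by NVIDIA's `nvidia-ml-py`, which exposes the same `pynvml` module, so the remedy is an environment-level package swap rather than a code change. A quick smoke test after the swap (the device index 0 is just an example):

# After replacing the package (pip uninstall pynvml && pip install nvidia-ml-py),
# the same import path keeps working; verify NVML is usable:
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first visible GPU
print(pynvml.nvmlDeviceGetName(handle))        # e.g. the GPU product name
pynvml.nvmlShutdown()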
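The `Trainer._get_train_sampler replaced with custom implementation.` lines indicate the training script swaps out the Hugging Face Trainer's default shuffling sampler, presumably to control example order for the Order_Ablations experiment. The log does not show the replacement itself; below is a minimal sketch of one way to do it. `FixedOrderSampler`, `FixedOrderTrainer`, and the identity ordering are illustrative assumptions, not the script's actual code, and the `_get_train_sampler` signature varies across transformers versions.

# Hypothetical sketch: pin the order in which training examples are visited
# by returning a deterministic sampler instead of the default RandomSampler.
from torch.utils.data import Sampler
from transformers import Trainer

class FixedOrderSampler(Sampler):
    """Yield dataset indices in a caller-supplied order, with no shuffling."""
    def __init__(self, order):
        self.order = list(order)

    def __iter__(self):
        return iter(self.order)

    def __len__(self):
        return len(self.order)

class FixedOrderTrainer(Trainer):
    def _get_train_sampler(self):
        # Identity order shown for illustration; an ablation would pass in
        # a precomputed permutation instead.
        return FixedOrderSampler(range(len(self.train_dataset)))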
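The `comm.py` lines show DeepSpeed bringing up torch.distributed with the NCCL backend; `cdb=None` means no custom distributed backend was injected, so DeepSpeed falls back to its TorchBackend wrapper. The explicit equivalent, normally invoked by the Trainer/launcher rather than user code, is roughly:

import deepspeed

# Initializes torch.distributed over NCCL using the RANK / WORLD_SIZE /
# MASTER_ADDR environment variables set by torchrun; no-op if already initialized.
deepspeed.init_distributed(dist_backend="nccl")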
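The Flash Attention 2.0 warning is emitted by transformers whenever a model is instantiated on CPU with `attn_implementation="flash_attention_2"`; it is benign as long as the model reaches the GPU before the first forward pass, which the Trainer/DeepSpeed setup does. A minimal load that triggers and then satisfies the warning might look like this; the checkpoint name is a placeholder, since the log does not show which model is loaded:

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "some-org/some-model",          # placeholder; not shown in the log
    torch_dtype=torch.bfloat16,     # FlashAttention-2 requires fp16/bf16 weights
    attn_implementation="flash_attention_2",
)
model.to("cuda")  # moving to GPU before the first forward resolves the warning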