diff --git "a/training_log_truc.txt" "b/training_log_truc.txt" new file mode 100644--- /dev/null +++ "b/training_log_truc.txt" @@ -0,0 +1,2538 @@ +W0805 17:40:32.513000 58575 /mnt/hwfile/liuzhaoyang/workspace/programs/miniconda3/envs/qwen2_5vl/lib/python3.10/site-packages/torch/distributed/run.py:792] +W0805 17:40:32.513000 58575 /mnt/hwfile/liuzhaoyang/workspace/programs/miniconda3/envs/qwen2_5vl/lib/python3.10/site-packages/torch/distributed/run.py:792] ***************************************** +W0805 17:40:32.513000 58575 /mnt/hwfile/liuzhaoyang/workspace/programs/miniconda3/envs/qwen2_5vl/lib/python3.10/site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0805 17:40:32.513000 58575 /mnt/hwfile/liuzhaoyang/workspace/programs/miniconda3/envs/qwen2_5vl/lib/python3.10/site-packages/torch/distributed/run.py:792] ***************************************** +[2025-08-05 17:41:05,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:05,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-08-05 17:41:13,989] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:13,989] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:13,989] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:13,989] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:13,989] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:13,990] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:13,990] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,005] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,008] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-08-05 17:41:14,008] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-08-05 17:41:14,485] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,518] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,599] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,605] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,607] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,614] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,615] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,616] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,621] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,657] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,659] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,661] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,663] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,663] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +[2025-08-05 17:41:14,664] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-08-05 17:41:14,664] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. + Loading checkpoint shards: 0%| | 0/2 [00:00 before Client(conf_path) +Rank 0: --> after Client(conf_path) +Rank 0: Loading datasets: /mnt/petrelfs/liuzhaoyang/workspace/GUIAgent/internvl_chat/data/internvl_meta/science/feiben_materials_250805_1.json +Rank 0: Loading internal_android_planning_cot_20250612 +Rank 0: Skipping internal_android_planning_cot_20250612 due to repeat_time=0 +Rank 0: Loading internal_mac_planning_cot_20250612 +Rank 0: Skipping internal_mac_planning_cot_20250612 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_20250612 +Rank 0: Skipping internal_ubuntu_planning_cot_20250612 due to repeat_time=0 +Rank 0: Loading internal_windows_planning_cot_20250612 +Rank 0: Skipping internal_windows_planning_cot_20250612 due to repeat_time=0 +Rank 0: Loading internal_web_planning_cot_20250612 +Rank 0: Skipping internal_web_planning_cot_20250612 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_boost_action_20250612 +Rank 0: Skipping internal_ubuntu_planning_cot_boost_action_20250612 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_boost_instruction_action_20250612 +Rank 0: Skipping internal_ubuntu_planning_cot_boost_instruction_action_20250612 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_boost_action_20250624 +Rank 0: Skipping internal_ubuntu_planning_cot_boost_action_20250624 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_boost_instruction_action_20250624 +Rank 0: Skipping internal_ubuntu_planning_cot_boost_instruction_action_20250624 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_20250624 +Rank 0: Skipping internal_ubuntu_planning_cot_20250624 due to repeat_time=0 +Rank 0: Loading internal_ubuntu_planning_cot_boost_instruction_20250612 +Rank 0: Loading VC:s3://gui-agent/data_20250612/ubuntu/planning_20250720_boost_instruction.jsonl with random:50% sampling strategy +Rank 0: Loaded 4182 samples from VC:s3://gui-agent/data_20250612/ubuntu/planning_20250720_boost_instruction.jsonl +Rank 0: Loading internal_windows_planning_cot_boost_instruction_20250612 +Rank 0: Loading VC:s3://gui-agent/data_20250612/windows/planning_20250720_boost_instruction.jsonl with all sampling strategy +Rank 0: Loaded 26412 samples from VC:s3://gui-agent/data_20250612/windows/planning_20250720_boost_instruction.jsonl +Rank 0: Loading internal_ubuntu_planning_cot_boost_instruction_20250624 +Rank 0: Loading VC:s3://gui-agent/data_20250624/ubuntu/planning_20250720_boost_instruction.jsonl with random:50% sampling strategy +Rank 0: Loaded 7883 samples from VC:s3://gui-agent/data_20250624/ubuntu/planning_20250720_boost_instruction.jsonl +Rank 0: Loading internal_windows_planning_cot_boost_instruction_20250707 +Rank 0: Loading VC:s3://gui-agent/data_20250707/windows/planning_20250720_boost_instruction.jsonl with all sampling strategy +Rank 0: Loaded 17796 samples from VC:s3://gui-agent/data_20250707/windows/planning_20250720_boost_instruction.jsonl +Rank 0: Loading internal_ubuntu_planning_cot_boost_instruction_20250707 +Rank 0: Loading VC:s3://gui-agent/data_20250707/ubuntu/planning_20250720_boost_instruction.jsonl with random:50% sampling strategy +Rank 0: Loaded 21026 samples from VC:s3://gui-agent/data_20250707/ubuntu/planning_20250720_boost_instruction.jsonl +Rank 0: Loading internal_windows_planning_cot_boost_instruction_20250714 +Rank 0: Loading VC:s3://gui-agent/data_20250714/windows/planning_20250720_boost_instruction.jsonl with all sampling strategy +Rank 0: Loaded 44307 samples from VC:s3://gui-agent/data_20250714/windows/planning_20250720_boost_instruction.jsonl +Rank 0: Loading internal_ubuntu_planning_cot_boost_instruction_20250714 +Rank 0: Loading VC:s3://gui-agent/data_20250714/ubuntu/planning_20250720_boost_instruction.jsonl with random:50% sampling strategy +Rank 0: Loaded 16767 samples from VC:s3://gui-agent/data_20250714/ubuntu/planning_20250720_boost_instruction.jsonl +Rank 0: Loading windows_feiben_material_navigation_20250803 +Rank 0: Loading VC:s3://gui-agent/data_20250803/windows/navigation_20250803.jsonl with repeat:2 sampling strategy +Rank 0: Loaded 5538 samples from VC:s3://gui-agent/data_20250803/windows/navigation_20250803.jsonl +Rank 0: Loading windows_feiben_material_planning_cot_20250803 +Rank 0: Loading VC:s3://gui-agent/data_20250803/windows/planning_20250803.jsonl with repeat:2 sampling strategy +Rank 0: Loaded 5538 samples from VC:s3://gui-agent/data_20250803/windows/planning_20250803.jsonl +Rank 0: Total training samples: 149449 +Rank 0: Formatting inputs...Skip in lazy mode +Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. +Rank 0: Length of multimodal samples: 149440, pure textual samples: 0 +Parameter Offload: Total persistent parameters: 755712 in 408 params + 0%| | 0/2335 [00:00