diff --git "a/training_log.txt" "b/training_log.txt" new file mode 100644--- /dev/null +++ "b/training_log.txt" @@ -0,0 +1,1906 @@ +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +07/31/2025 07:38:54 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +07/31/2025 07:38:54 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +batch_eval_metrics=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=8, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl3_8b_lora_a40/runs/Jul31_07-38-54_cfb9aa843d58, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.COSINE, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=work_dirs/internvl3_8b_lora_a40, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=work_dirs/internvl3_8b_lora_a40, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.05, +) +07/31/2025 07:38:54 - INFO - __main__ - Loading Tokenizer: OpenGVLab/InternVL3-8B +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:38:54,220 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/vocab.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:38:54,220 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/merges.txt +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:38:54,220 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/tokenizer.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:38:54,220 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/added_tokens.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:38:54,220 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/special_tokens_map.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:38:54,220 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/tokenizer_config.json +[WARNING|logging.py:314] 2025-07-31 07:38:54,447 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +07/31/2025 07:38:54 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:733] 2025-07-31 07:38:54,483 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/config.json +[INFO|configuration_utils.py:796] 2025-07-31 07:38:54,485 >> Model config InternVLChatConfig { + "_commit_hash": "24dc81a234a6e1901f3314eeadaa2813f2b78038", + "_name_or_path": "/mnt/petrelfs/wangweiyun/workspace_wwy/open_source/InternVL/internvl_chat/work_dirs/internvl_chat_v3_0/InternVL3_0-8B-MPO-try0-2", + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "OpenGVLab/InternVL3-8B--configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "OpenGVLab/InternVL3-8B--modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "OpenGVLab/InternVL3-8B--modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 3584, + "image_fold": null, + "llm_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "./pretrained/Qwen2.5-32B-Instruct", + "add_cross_attention": false, + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151643, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 3584, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 18944, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "max_window_layers": 70, + "min_length": 0, + "model_type": "qwen2", + "moe_config": null, + "no_repeat_ngram_size": 0, + "num_attention_heads": 28, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "factor": 2.0, + "rope_type": "dynamic", + "type": "dynamic" + }, + "rope_theta": 1000000.0, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.41.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151674 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "system_message": null, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_intern_vit.InternVisionConfig", + "AutoModel": "modeling_intern_vit.InternVisionModel" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "capacity_factor": 1.2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.1, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "eval_capacity_factor": 1.4, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 0.1, + "initializer_range": 1e-10, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "laux_allreduce": "all_nodes", + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "moe_coeff_ratio": 0.5, + "moe_intermediate_size": 768, + "moe_output_scale": 4.0, + "no_repeat_ngram_size": 0, + "noisy_gate_policy": "RSample_before", + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_experts": 8, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "num_routed_experts": 4, + "num_shared_experts": 4, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "shared_expert_intermediate_size": 3072, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.41.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true, + "use_moe": false, + "use_residual": true, + "use_rts": false, + "use_weighted_residual": false + } +} + +07/31/2025 07:38:54 - INFO - __main__ - Using flash_attention_2 for LLaMA +[INFO|modeling_utils.py:3474] 2025-07-31 07:38:54,488 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/model.safetensors.index.json +[INFO|modeling_utils.py:1519] 2025-07-31 07:38:54,492 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:962] 2025-07-31 07:38:54,493 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:962] 2025-07-31 07:38:54,544 >> Generate config GenerationConfig { + "bos_token_id": 151643, + "eos_token_id": 151643, + "use_cache": false +} + + Loading checkpoint shards: 0%| | 0/4 [00:00> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4288] 2025-07-31 07:39:01,534 >> All the weights of InternVLChatModel were initialized from the model checkpoint at OpenGVLab/InternVL3-8B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:917] 2025-07-31 07:39:01,571 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/generation_config.json +[INFO|configuration_utils.py:962] 2025-07-31 07:39:01,571 >> Generate config GenerationConfig {} + +07/31/2025 07:39:01 - INFO - __main__ - Finished +07/31/2025 07:39:01 - INFO - __main__ - model.config.force_image_size: 448 +07/31/2025 07:39:01 - INFO - __main__ - data_args.force_image_size: 448 +07/31/2025 07:39:01 - INFO - __main__ - model.config.vision_config.image_size: 448 +07/31/2025 07:39:01 - INFO - __main__ - [Dataset] num_image_token: 256 +07/31/2025 07:39:01 - INFO - __main__ - [Dataset] dynamic_image_size: True +07/31/2025 07:39:01 - INFO - __main__ - [Dataset] use_thumbnail: True +07/31/2025 07:39:01 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +07/31/2025 07:39:01 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[WARNING|tokenization_utils_base.py:3921] 2025-07-31 07:39:01,662 >> Token indices sequence length is longer than the specified maximum sequence length for this model (39388 > 16382). Running this sequence through the model will result in indexing errors +07/31/2025 07:39:19 - INFO - __main__ - Add dataset: custom_ds with length: 198 +trainable params: 322,961,408 || all params: 7,935,782,400 || trainable%: 4.0697 +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.down_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.up_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.up_proj.lora_B.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.down_proj.lora_A.default.weight +07/31/2025 07:39:29 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.down_proj.lora_B.default.weight +[INFO|trainer.py:641] 2025-07-31 07:39:36,192 >> Using auto half precision backend +[INFO|trainer.py:2078] 2025-07-31 07:39:36,701 >> ***** Running training ***** +[INFO|trainer.py:2079] 2025-07-31 07:39:36,701 >> Num examples = 198 +[INFO|trainer.py:2080] 2025-07-31 07:39:36,701 >> Num Epochs = 1 +[INFO|trainer.py:2081] 2025-07-31 07:39:36,701 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:2084] 2025-07-31 07:39:36,701 >> Total train batch size (w. parallel, distributed & accumulation) = 8 +[INFO|trainer.py:2085] 2025-07-31 07:39:36,701 >> Gradient Accumulation steps = 8 +[INFO|trainer.py:2086] 2025-07-31 07:39:36,701 >> Total optimization steps = 24 +[INFO|trainer.py:2087] 2025-07-31 07:39:36,705 >> Number of trainable parameters = 322,961,408 + 0%| | 0/24 [00:00 +[rank0]: main() +[rank0]: File "/workspace/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 1057, in main +[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 1885, in train +[rank0]: return inner_training_loop( +[rank0]: ^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2216, in _inner_training_loop +[rank0]: tr_loss_step = self.training_step(model, inputs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3250, in training_step +[rank0]: self.accelerator.backward(loss) +[rank0]: File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2013, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 768, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 9.26 GiB. GPU 0 has a total capacity of 44.45 GiB of which 3.91 GiB is free. Process 2931665 has 40.53 GiB memory in use. Of the allocated memory 38.42 GiB is allocated by PyTorch, and 1.48 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + 0%| | 0/24 [00:27 + sys.exit(main()) + ^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/torch/distributed/run.py", line 901, in main + run(args) + File "/usr/local/lib/python3.11/dist-packages/torch/distributed/run.py", line 892, in run + elastic_launch( + File "/usr/local/lib/python3.11/dist-packages/torch/distributed/launcher/api.py", line 133, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/torch/distributed/launcher/api.py", line 264, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +internvl/train/internvl_chat_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-07-31_07:40:06 + host : cfb9aa843d58 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 18698) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +07/31/2025 07:41:44 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +07/31/2025 07:41:44 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +batch_eval_metrics=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=8, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl3_8b_lora_a40/runs/Jul31_07-41-44_cfb9aa843d58, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.COSINE, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=work_dirs/internvl3_8b_lora_a40, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=work_dirs/internvl3_8b_lora_a40, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.05, +) +07/31/2025 07:41:44 - INFO - __main__ - Loading Tokenizer: OpenGVLab/InternVL3-8B +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:41:44,594 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/vocab.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:41:44,594 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/merges.txt +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:41:44,595 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/tokenizer.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:41:44,595 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/added_tokens.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:41:44,595 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/special_tokens_map.json +[INFO|tokenization_utils_base.py:2108] 2025-07-31 07:41:44,595 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/tokenizer_config.json +[WARNING|logging.py:314] 2025-07-31 07:41:44,835 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +07/31/2025 07:41:44 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:733] 2025-07-31 07:41:44,866 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/config.json +[INFO|configuration_utils.py:796] 2025-07-31 07:41:44,868 >> Model config InternVLChatConfig { + "_commit_hash": "24dc81a234a6e1901f3314eeadaa2813f2b78038", + "_name_or_path": "/mnt/petrelfs/wangweiyun/workspace_wwy/open_source/InternVL/internvl_chat/work_dirs/internvl_chat_v3_0/InternVL3_0-8B-MPO-try0-2", + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "OpenGVLab/InternVL3-8B--configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "OpenGVLab/InternVL3-8B--modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "OpenGVLab/InternVL3-8B--modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 3584, + "image_fold": null, + "llm_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "./pretrained/Qwen2.5-32B-Instruct", + "add_cross_attention": false, + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151643, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 3584, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 18944, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "max_window_layers": 70, + "min_length": 0, + "model_type": "qwen2", + "moe_config": null, + "no_repeat_ngram_size": 0, + "num_attention_heads": 28, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "factor": 2.0, + "rope_type": "dynamic", + "type": "dynamic" + }, + "rope_theta": 1000000.0, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.41.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151674 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "system_message": null, + "template": "internvl2_5", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_intern_vit.InternVisionConfig", + "AutoModel": "modeling_intern_vit.InternVisionModel" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "capacity_factor": 1.2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.1, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "eval_capacity_factor": 1.4, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 0.1, + "initializer_range": 1e-10, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "laux_allreduce": "all_nodes", + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "moe_coeff_ratio": 0.5, + "moe_intermediate_size": 768, + "moe_output_scale": 4.0, + "no_repeat_ngram_size": 0, + "noisy_gate_policy": "RSample_before", + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_experts": 8, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "num_routed_experts": 4, + "num_shared_experts": 4, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "shared_expert_intermediate_size": 3072, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.41.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true, + "use_moe": false, + "use_residual": true, + "use_rts": false, + "use_weighted_residual": false + } +} + +07/31/2025 07:41:44 - INFO - __main__ - Using flash_attention_2 for LLaMA +[INFO|modeling_utils.py:3474] 2025-07-31 07:41:44,871 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/model.safetensors.index.json +[INFO|modeling_utils.py:1519] 2025-07-31 07:41:44,874 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:962] 2025-07-31 07:41:44,875 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:962] 2025-07-31 07:41:44,924 >> Generate config GenerationConfig { + "bos_token_id": 151643, + "eos_token_id": 151643, + "use_cache": false +} + + Loading checkpoint shards: 0%| | 0/4 [00:00> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4288] 2025-07-31 07:41:52,090 >> All the weights of InternVLChatModel were initialized from the model checkpoint at OpenGVLab/InternVL3-8B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:917] 2025-07-31 07:41:52,132 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--OpenGVLab--InternVL3-8B/snapshots/24dc81a234a6e1901f3314eeadaa2813f2b78038/generation_config.json +[INFO|configuration_utils.py:962] 2025-07-31 07:41:52,132 >> Generate config GenerationConfig {} + +07/31/2025 07:41:52 - INFO - __main__ - Finished +07/31/2025 07:41:52 - INFO - __main__ - model.config.force_image_size: 448 +07/31/2025 07:41:52 - INFO - __main__ - data_args.force_image_size: 448 +07/31/2025 07:41:52 - INFO - __main__ - model.config.vision_config.image_size: 448 +07/31/2025 07:41:52 - INFO - __main__ - [Dataset] num_image_token: 256 +07/31/2025 07:41:52 - INFO - __main__ - [Dataset] dynamic_image_size: True +07/31/2025 07:41:52 - INFO - __main__ - [Dataset] use_thumbnail: True +07/31/2025 07:41:52 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +07/31/2025 07:41:52 - INFO - __main__ - Formatting inputs...Skip in lazy mode +[WARNING|tokenization_utils_base.py:3921] 2025-07-31 07:41:52,310 >> Token indices sequence length is longer than the specified maximum sequence length for this model (39388 > 8191). Running this sequence through the model will result in indexing errors +07/31/2025 07:42:08 - INFO - __main__ - Add dataset: custom_ds with length: 198 +trainable params: 322,961,408 || all params: 7,935,782,400 || trainable%: 4.0697 +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.24.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.25.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.26.mlp.down_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.o_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.self_attn.o_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.gate_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.gate_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.up_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.up_proj.lora_B.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.down_proj.lora_A.default.weight +07/31/2025 07:42:18 - INFO - __main__ - language_model.base_model.model.model.layers.27.mlp.down_proj.lora_B.default.weight +[INFO|trainer.py:641] 2025-07-31 07:42:28,356 >> Using auto half precision backend +[INFO|trainer.py:2078] 2025-07-31 07:42:28,796 >> ***** Running training ***** +[INFO|trainer.py:2079] 2025-07-31 07:42:28,796 >> Num examples = 198 +[INFO|trainer.py:2080] 2025-07-31 07:42:28,796 >> Num Epochs = 1 +[INFO|trainer.py:2081] 2025-07-31 07:42:28,796 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:2084] 2025-07-31 07:42:28,796 >> Total train batch size (w. parallel, distributed & accumulation) = 8 +[INFO|trainer.py:2085] 2025-07-31 07:42:28,796 >> Gradient Accumulation steps = 8 +[INFO|trainer.py:2086] 2025-07-31 07:42:28,796 >> Total optimization steps = 24 +[INFO|trainer.py:2087] 2025-07-31 07:42:28,800 >> Number of trainable parameters = 322,961,408 + 0%| | 0/24 [00:00> + +Training completed. Do not forget to share your model on huggingface.co/models =) + + + {'train_runtime': 1207.2389, 'train_samples_per_second': 0.164, 'train_steps_per_second': 0.02, 'train_loss': 0.0, 'epoch': 0.97} + 100%|██████████████████████████████████████████████████████| 24/24 [20:07<00:00, 49.38s/it] 100%|██████████████████████████████████████████████████████| 24/24 [20:07<00:00, 50.30s/it] +[INFO|trainer.py:3410] 2025-07-31 08:02:36,063 >> Saving model checkpoint to work_dirs/internvl3_8b_lora_a40 +[INFO|configuration_utils.py:472] 2025-07-31 08:02:36,073 >> Configuration saved in work_dirs/internvl3_8b_lora_a40/config.json +[INFO|configuration_utils.py:731] 2025-07-31 08:02:36,079 >> Configuration saved in work_dirs/internvl3_8b_lora_a40/generation_config.json +[INFO|modeling_utils.py:2626] 2025-07-31 08:03:27,623 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at work_dirs/internvl3_8b_lora_a40/model.safetensors.index.json. +[INFO|tokenization_utils_base.py:2513] 2025-07-31 08:03:27,633 >> tokenizer config file saved in work_dirs/internvl3_8b_lora_a40/tokenizer_config.json +[INFO|tokenization_utils_base.py:2522] 2025-07-31 08:03:27,637 >> Special tokens file saved in work_dirs/internvl3_8b_lora_a40/special_tokens_map.json +***** train metrics ***** + epoch = 0.9697 + total_flos = 24427775104GF + train_loss = 0.0 + train_runtime = 0:20:07.23 + train_samples = 198 + train_samples_per_second = 0.164 + train_steps_per_second = 0.02 +[rank0]:[W731 08:03:28.207548993 ProcessGroupNCCL.cpp:1168] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())