"""
Fixed model fine-tuning script.
Core improvements:
1. Robust label masking (only the assistant replies are learned) - final, most robust revision.
2. Resolves the compatibility issue where QwenTokenizer builds lack the `im_end_id` attribute.
3. Fixes `TypeError: '<=' not supported between instances of 'float' and 'str'` when reading numeric config values.
"""
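# Typical launch (hypothetical script name; adjust GPU count and paths to your environment):
#   deepspeed --num_gpus=2 finetune_fixed.py
# The DeepSpeed ZeRO-3 config referenced in TrainingArguments below is expected at
# ../config/deepspeed_zero3.json relative to the working directory.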
import os
import json
import yaml
import torch
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, List
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import numpy as np


@dataclass
class ModelArguments:
    """Model arguments"""
    model_name_or_path: str = field(default="Qwen/Qwen3-8B")
    use_lora: bool = field(default=True)
    lora_r: int = field(default=64)
    lora_alpha: int = field(default=128)
    lora_dropout: float = field(default=0.05)
    lora_target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ]
    )


@dataclass
class DataArguments:
    """Data arguments"""
    data_dir: str = field(default="./data/training_data")
    max_length: int = field(default=1024)
    preprocessing_num_workers: int = field(default=32)


class SampleInspectionCallback(TrainerCallback):
    """Training sample inspection callback"""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.checked = False

    def on_step_begin(self, args, state, control, **kwargs):
        """Inspect samples when the first training step begins"""
        if not self.checked and state.global_step == 0:
            print("\n" + "="*60)
            print("🔍 Inspecting training samples...")
            print("="*60)
            self.checked = True


class QwenFineTunerFixed:
    """Qwen model fine-tuner - fixed version"""

    config_path = Path(__file__).parent.parent / "config" / "default_config.yaml"

    def __init__(self, config_path: str = config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        self.model_args = ModelArguments(
            model_name_or_path=self.config['model']['base_model']
        )
        self.data_args = DataArguments(
            data_dir=self.config['dataset']['output_dir']
        )

        self.output_dir = Path(self.config['training']['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.eval_dataset = None

        self.im_end_token_id = None

    def load_tokenizer_and_model(self):
        """Load the tokenizer and the model"""
        print(f"Loading tokenizer from {self.model_args.model_name_or_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_args.model_name_or_path,
            trust_remote_code=True,
            padding_side='right'
        )
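
        # Some QwenTokenizer builds do not expose an `im_end_id` attribute, so resolve the
        # ChatML end-of-turn token directly and fall back to EOS if the lookup fails.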
        try:
            self.im_end_token_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
            if self.im_end_token_id is None:
                raise ValueError("Could not convert <|im_end|> token to ID.")
        except Exception as e:
            print(f"Warning: Could not get <|im_end|> ID, trying fallback: {e}")
            self.im_end_token_id = self.tokenizer.eos_token_id
        print(f"Using im_end_id: {self.im_end_token_id}")
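
        # If the tokenizer ships without a pad token, reuse EOS so right-padded batches are valid.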
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        if self.tokenizer.chat_template is None:
            print("Warning: Qwen chat template not found. Using default template logic.")

        print(f"Loading model from {self.model_args.model_name_or_path}")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_args.model_name_or_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            use_cache=False,
            low_cpu_mem_usage=True
        )

        print("Preparing model for LoRA training...")
        if self.model_args.use_lora:
            print("Applying LoRA configuration")
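            # With the defaults (r=64, alpha=128) the effective LoRA scaling alpha/r is 2.0;
            # adapters are attached to the attention and MLP projections listed in ModelArguments.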
            lora_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                r=self.model_args.lora_r,
                lora_alpha=self.model_args.lora_alpha,
                lora_dropout=self.model_args.lora_dropout,
                target_modules=self.model_args.lora_target_modules,
                bias="none",
                inference_mode=False,
            )

            self.model = get_peft_model(self.model, lora_config)
            self.model.print_trainable_parameters()
            self.model.train()

        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"✓ Trainable parameters: {trainable:,}")

    def load_and_preprocess_data(self):
        """Load and preprocess the datasets"""
        print("Loading datasets...")
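
        # Each JSONL line is expected to carry a `conversations` list of
        # {"role": ..., "content": ...} messages (see _preprocess_function below).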
        data_files = {
            'train': str(Path(self.data_args.data_dir) / 'train.jsonl'),
            'validation': str(Path(self.data_args.data_dir) / 'val.jsonl'),
        }

        raw_datasets = load_dataset('json', data_files=data_files)

        print("Preprocessing datasets...")
        self.train_dataset = raw_datasets['train'].map(
            self._preprocess_function,
            batched=True,
            num_proc=self.data_args.preprocessing_num_workers,
            remove_columns=raw_datasets['train'].column_names,
            desc="Preprocessing train dataset"
        )

        self.eval_dataset = raw_datasets['validation'].map(
            self._preprocess_function,
            batched=True,
            num_proc=self.data_args.preprocessing_num_workers,
            remove_columns=raw_datasets['validation'].column_names,
            desc="Preprocessing validation dataset"
        )

        print("Filtering samples...")
        self.train_dataset = self.train_dataset.filter(
            lambda x: x is not None and len(x['input_ids']) <= self.data_args.max_length
        )
        self.eval_dataset = self.eval_dataset.filter(
            lambda x: x is not None and len(x['input_ids']) <= self.data_args.max_length
        )

        print(f"✓ Train samples: {len(self.train_dataset)}")
        print(f"✓ Validation samples: {len(self.eval_dataset)}")

        if len(self.train_dataset) > 0:
            self._inspect_sample(self.train_dataset[0])

    def _preprocess_function(self, examples):
        """Preprocessing function - final, most robust label masking"""
        model_inputs = {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
        }
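
        # Strategy: render each conversation twice with the chat template - the full dialogue
        # and a prompt-only prefix that stops at the last assistant reply. The token length of
        # that prefix is used as the reply's start index, and every label before it is set to
        # -100 so the loss is computed only on the assistant's answer.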
        for conversations in examples['conversations']:
            try:
                full_text = self.tokenizer.apply_chat_template(
                    conversations,
                    tokenize=False,
                    add_generation_prompt=False
                )

                last_assistant_index = next(
                    (i for i, msg in reversed(list(enumerate(conversations))) if msg['role'] == 'assistant'),
                    -1
                )

                if last_assistant_index == -1:
                    print("Warning: Skipping conversation with no assistant reply.")
                    continue

                prompt_messages = conversations[:last_assistant_index]
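                # Appending an empty assistant turn reproduces the assistant header in the
                # rendered prefix; its token length is used below as the reply's start index.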
                prompt_messages.append({"role": "assistant", "content": ""})

                prompt_text = self.tokenizer.apply_chat_template(
                    prompt_messages,
                    tokenize=False,
                    add_generation_prompt=False
                )

                tokenized_full = self.tokenizer(
                    full_text,
                    max_length=self.data_args.max_length,
                    truncation=True,
                    padding=False,
                )

                tokenized_prompt = self.tokenizer(
                    prompt_text,
                    max_length=self.data_args.max_length,
                    truncation=True,
                    padding=False,
                )

                input_ids = tokenized_full['input_ids']
                labels = input_ids.copy()

                answer_start_index = len(tokenized_prompt['input_ids'])

                # Nothing left to learn (e.g. the reply was truncated away by max_length):
                # skip this sample rather than abandoning the whole batch.
                if answer_start_index >= len(labels):
                    print(f"Warning: Answer start index {answer_start_index} exceeds or matches total length {len(labels)}. Skipping.")
                    continue

                # Mask the prompt region so only the assistant reply contributes to the loss.
                labels[:answer_start_index] = [-100] * answer_start_index

                # Avoid training on a trailing EOS / <|im_end|> token.
                if len(labels) > 0:
                    last_token_id = labels[-1]

                    if last_token_id != -100 and last_token_id == self.tokenizer.eos_token_id:
                        labels[-1] = -100

                    if self.im_end_token_id is not None and last_token_id != -100 and last_token_id == self.im_end_token_id:
                        labels[-1] = -100

                model_inputs["input_ids"].append(input_ids)
                model_inputs["attention_mask"].append(tokenized_full['attention_mask'])
                model_inputs["labels"].append(labels)

            except Exception as e:
                import sys
                import traceback
                traceback.print_exc(file=sys.stdout)
                print(f"Error processing conversation: {e}")
                # Skip the offending conversation; a batched `map` function must always
                # return a dict, so returning None here would fail the whole batch.
                continue

        return model_inputs

    def _inspect_sample(self, sample):
        """Inspect sample quality"""
        print("\n" + "="*60)
        print("🔍 Sample Inspection (AFTER FINAL, MOST ROBUST FIXES)")
        print("="*60)

        input_ids = sample['input_ids']
        labels = sample['labels']

        input_text = self.tokenizer.decode(input_ids, skip_special_tokens=False)

        total_tokens = len(input_ids)
        masked_tokens = sum(1 for l in labels if l == -100)
        learning_tokens = total_tokens - masked_tokens

        print(f"Total tokens: {total_tokens}")
        print(f"Masked tokens (prompt/padding): {masked_tokens} ({masked_tokens/total_tokens*100:.1f}%)")
        print(f"Learning tokens (assistant): {learning_tokens} ({learning_tokens/total_tokens*100:.1f}%)")

        print("\n📊 First 200 tokens masking pattern:")
        preview_len = min(200, len(labels))
        mask_preview = ''.join(['█' if labels[i] == -100 else '░' for i in range(preview_len)])

        first_learn_idx = next((i for i, l in enumerate(labels) if l != -100), -1)

        if first_learn_idx != -1:
            print(f"First 10 tokens: {self.tokenizer.decode(input_ids[:10], skip_special_tokens=False)}")
            print(f"First learning token index: {first_learn_idx}")
            print(f"First learning token: {self.tokenizer.decode(input_ids[first_learn_idx])}")

            start = max(0, first_learn_idx - 5)
            end = min(len(input_ids), first_learn_idx + 5)
            print(f"Around learning start: {self.tokenizer.decode(input_ids[start:end], skip_special_tokens=False)}")

        print(mask_preview)
        print("█ = masked (prompt/padding) | ░ = learning (assistant)")

        learning_ids = [input_ids[i] for i in range(len(labels)) if labels[i] != -100]
        if learning_ids:
            learning_text = self.tokenizer.decode(learning_ids[:100], skip_special_tokens=True)
            print(f"\n📝 Learning content preview:")
            print(f"{learning_text[:200]}...")

        print("="*60 + "\n")

    def train(self):
        """Train the model"""
        print("Setting up training arguments...")
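
        # Per-device batch size 2 with 8 gradient-accumulation steps gives an effective batch
        # of 16 sequences per optimizer step on each GPU (times the number of GPUs).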
        training_args = TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=self.config['training']['num_epochs'],

            # Batch sizes
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=8,

            # Learning-rate schedule (cast to float: YAML values such as "2e-5" may load as strings)
            learning_rate=float(self.config['training']['learning_rate']),
            warmup_ratio=float(self.config['training']['warmup_ratio']),
            lr_scheduler_type="cosine",

            # Optimizer
            optim="adamw_torch",
            weight_decay=float(self.config['training']['weight_decay']),
            max_grad_norm=float(self.config['training']['max_grad_norm']),

            # Logging and checkpointing
            logging_steps=10,
            save_steps=100,
            eval_steps=100,
            save_total_limit=3,

            # Evaluation and best-model selection
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,

            # Mixed precision
            bf16=True,
            bf16_full_eval=True,

            # DeepSpeed ZeRO-3
            deepspeed="../config/deepspeed_zero3.json",

            # Misc
            report_to=["tensorboard"],
            logging_dir=str(self.output_dir / "logs"),
            remove_unused_columns=False,
            dataloader_pin_memory=True,
            dataloader_num_workers=0,
            logging_first_step=True,
            logging_nan_inf_filter=True,
        )
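
        # DataCollatorForSeq2Seq pads input_ids/attention_mask through the tokenizer and pads
        # the label lists with -100 so padded positions are ignored by the loss.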
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            label_pad_token_id=-100,
            padding=True,
        )

        callbacks = [SampleInspectionCallback(self.tokenizer)]

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            callbacks=callbacks,
        )

        print("\n" + "="*60)
        print("Pre-training Validation")
        print("="*60)
        print(f"✓ Model in training mode: {self.model.training}")

        lora_params = sum(p.numel() for n, p in self.model.named_parameters()
                          if p.requires_grad and 'lora' in n.lower())
        print(f"✓ LoRA parameters: {lora_params:,}")

        print("\n" + "="*60)
        print("Starting Training")
        print("="*60)

        train_result = trainer.train()

        print("\nSaving model...")
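        # With LoRA enabled this saves the adapter weights and adapter config rather than
        # the full merged base model.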
        trainer.save_model(str(self.output_dir / "final_model"))

        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)

        print("\nEvaluating...")
        eval_metrics = trainer.evaluate()
        trainer.log_metrics("eval", eval_metrics)
        trainer.save_metrics("eval", eval_metrics)

        print("\n✓ Training completed!")
        return trainer


def main():
    """Main entry point"""
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    if 'TOKENIZERS_PARALLELISM' not in os.environ:
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    if 'PYTORCH_CUDA_ALLOC_CONF' not in os.environ:
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

    print("="*60)
    print("Qwen3-8B Fine-tuning - Fixed Version (Label Masking/LoRA Params Improved)")
    print("="*60)
    print()

    finetuner = QwenFineTunerFixed()
    finetuner.load_tokenizer_and_model()
    finetuner.load_and_preprocess_data()
    trainer = finetuner.train()

    print("\n" + "="*60)
    print("✓ Fine-tuning Complete!")
    print(f"Model saved to: {finetuner.output_dir}")
    print("="*60)


if __name__ == "__main__":
    main()
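
# A minimal sketch of loading the trained adapter for inference (assumes the output layout
# used above; the base-model name and paths may differ in your setup):
#
#   import torch
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B", torch_dtype=torch.bfloat16)
#   model = PeftModel.from_pretrained(base, "<output_dir>/final_model")
#   tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")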