import os
import gc
import torch
import torch.distributed as dist
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 1. Environment and GPU-memory configuration.
# expandable_segments reduces CUDA allocator fragmentation for large models.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Fetch the HF token from Kaggle secrets; fall back to anonymous access when
# secrets are unavailable (e.g. running outside Kaggle).
# Fixed: was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt.
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
except Exception:
    hf_token = None

# Bind this process to its GPU before initializing the NCCL process group.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
if not dist.is_initialized():
    dist.init_process_group(backend="nccl")

model_id = "Qwen/Qwen3.5-9B"
dataset_id = "a686d380/h-corpus-2023"

# 2. Aggressive 4-bit (NF4, double-quant) quantization configuration.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# 3. Load the model; device_map pins the whole model onto this rank's GPU.
if local_rank == 0:
    print(f"正在加载模型核心: {model_id}...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": local_rank},
    trust_remote_code=True,
    token=hf_token,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    attn_implementation="sdpa",
)

# 4. Architecture pruning: drop the last 12 decoder layers so the PEFT
#    conversion fits in memory, and keep the config's layer count in sync.
# NOTE(review): assumes a LLaMA-style layout exposing `model.model.layers`;
# the hasattr guard skips pruning on other architectures.
if hasattr(model, "model") and hasattr(model.model, "layers"):
    model.model.layers = model.model.layers[:-12]
    model.config.num_hidden_layers = len(model.model.layers)
    if local_rank == 0:
        print(f"裁剪成功。当前层数: {len(model.model.layers)}")

gc.collect()
torch.cuda.empty_cache()

# 5. QLoRA preparation: enable gradient flow through the frozen 4-bit base,
#    then attach LoRA adapters on the attention q/v projections.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
# 6. Data preprocessing (streaming fast-start version).
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=hf_token)
# The model ships without a pad token; reuse EOS so padding works.
tokenizer.pad_token = tokenizer.eos_token

if local_rank == 0:
    print("正在以流式模式连接数据集 (无需等待下载)...")

# streaming=True returns immediately — no full dataset download is required.
raw_dataset = load_dataset(dataset_id, split="train", token=hf_token, streaming=True)


def tokenize_fn(x):
    """Tokenize a batch dict: use the `text` column if present, else the first column.

    Returns fixed-length (512) input_ids/attention_mask, padded to max_length.
    """
    text_col = "text" if "text" in x else list(x.keys())[0]
    tokenized = tokenizer(x[text_col], truncation=True, max_length=512, padding="max_length")
    return {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]}


# map() on a streaming dataset is lazy, so this completes almost instantly.
# NOTE(review): remove_columns hard-codes "text" even though tokenize_fn
# resolves the column dynamically — confirm the dataset actually has a
# "text" column, otherwise this map will fail at iteration time.
tokenized_ds = raw_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# 7. Training arguments.
training_args = TrainingArguments(
    output_dir="./qwen_stream_out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=5,
    # A streaming dataset has no len(), so the step budget must be explicit.
    # (Fixed: this explanation was previously misattached to report_to.)
    max_steps=200,
    save_total_limit=1,
    ddp_find_unused_parameters=False,
    report_to="none",
    max_grad_norm=1.0,
    # Non-reentrant checkpointing is required for DDP compatibility here.
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# The KV cache is useless during training and conflicts with gradient checkpointing.
model.config.use_cache = False

if local_rank == 0:
    print("数据流已就绪,开始训练...")
trainer.train()

# 8. Save the LoRA adapter on rank 0 only, then tear down the process group
#    so NCCL resources are released cleanly (previously leaked at exit).
if local_rank == 0:
    trainer.model.save_pretrained("./qwen_final_lora")
    print("训练成功结束!")
if dist.is_initialized():
    dist.destroy_process_group()