#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Code LLM - QLoRA Fine-tuning Script
====================================
Base Model: Qwen/Qwen2.5-Coder-3B
Method: QLoRA SFT (4-bit NF4 + LoRA r=64)
Datasets: Code-Feedback (66K) + Magicoder-OSS (75K) + Evol-CodeAlpaca (110K) = ~250K
Hardware: RTX 3070 (8GB VRAM) or any GPU >= 8GB
Training time: ~6-8 hours (3 epochs)

Usage:
    pip install -r requirements_code.txt
    python code_llm_train.py
"""

import os
import sys
from datetime import datetime

import torch

# ============================================================
# CONFIGURATION - edit this section
# ============================================================
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
HF_USERNAME = "YOUR_HF_USERNAME"  # change to your HuggingFace username

# Training hyperparameters (tuned for RTX 3070 8GB)
TRAINING_CONFIG = {
    "learning_rate": 2e-4,
    "num_epochs": 3,
    "batch_size": 1,
    "gradient_accumulation": 16,
    "max_seq_length": 2048,
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "warmup_ratio": 0.05,
}

OUTPUT_DIR = f"{HF_USERNAME}/code-qwen2.5-coder-3b"
# ============================================================


def print_banner(text):
    print(f"\n{'=' * 60}")
    print(f"  {text}")
    print(f"{'=' * 60}")


def check_environment():
    print_banner("ENVIRONMENT CHECK")
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"✅ GPU: {gpu_name}")
        print(f"   VRAM: {vram:.1f} GB")
        if vram < 7:
            print("⚠️ Less than 8 GB of VRAM detected; training may OOM. Consider lowering max_seq_length to 1024")
    else:
        print("❌ No GPU detected! This script requires an NVIDIA GPU")
        sys.exit(1)

    required = ["transformers", "trl", "peft", "bitsandbytes", "accelerate", "datasets"]
    missing = []
    for pkg in required:
        try:
            __import__(pkg)
            print(f"✅ {pkg}")
        except ImportError:
            missing.append(pkg)
            print(f"❌ {pkg}")
    if missing:
        print(f"\nPlease run: pip install {' '.join(missing)}")
        sys.exit(1)


def load_datasets():
    from datasets import load_dataset, concatenate_datasets

    print_banner("LOADING DATASETS")

    print("📦 [1/3] Code-Feedback (66K multi-turn coding chat)...")
    code_feedback = load_dataset("m-a-p/Code-Feedback", split="train")
    # Keep only the "messages" column so all three datasets share one schema.
    cf_msgs = code_feedback.map(
        lambda x: {"messages": x["messages"]},
        remove_columns=[c for c in code_feedback.column_names if c != "messages"],
    )
    print(f"   ✅ {len(cf_msgs)} samples loaded")

    print("📦 [2/3] Magicoder-OSS-Instruct (75K real GitHub seeds)...")
    magicoder = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")

    def convert_magicoder(example):
        return {"messages": [
            {"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
            {"role": "user", "content": example["problem"]},
            {"role": "assistant", "content": example["solution"]},
        ]}

    mc_msgs = magicoder.map(convert_magicoder, remove_columns=magicoder.column_names)
    print(f"   ✅ {len(mc_msgs)} samples converted")

    print("📦 [3/3] Evol-CodeAlpaca (110K complexity-evolved)...")
    evol = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")

    def convert_evol(example):
        return {"messages": [
            {"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["output"]},
        ]}

    evol_msgs = evol.map(convert_evol, remove_columns=evol.column_names)
    print(f"   ✅ {len(evol_msgs)} samples converted")

    print("\n🔄 Merging datasets...")
    combined = concatenate_datasets([cf_msgs, mc_msgs, evol_msgs]).shuffle(seed=42)
    split = combined.train_test_split(test_size=0.02, seed=42)
    train_ds, eval_ds = split["train"], split["test"]

    print("\n📊 Dataset statistics:")
    print(f"   Code-Feedback:  {len(cf_msgs):>7,} samples")
    print(f"   Magicoder-OSS:  {len(mc_msgs):>7,} samples")
    print(f"   Evol-CodeAlpaca:{len(evol_msgs):>7,} samples")
    print(f"   {'─' * 35}")
    print(f"   Training total:   {len(train_ds):>7,} samples")
    print(f"   Validation total: {len(eval_ds):>7,} samples")
    return train_ds, eval_ds
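
# For reference, every record produced by load_datasets() follows the chat
# schema sketched below (illustrative values only; the real text comes from
# the source datasets). TRL's SFTTrainer renders such "messages" records with
# the tokenizer's chat template, roughly equivalent to:
#   tokenizer.apply_chat_template(example["messages"], tokenize=False)
#
# {
#     "messages": [
#         {"role": "system", "content": "You are an exceptionally skilled programmer. ..."},
#         {"role": "user", "content": "Write a function that ..."},
#         {"role": "assistant", "content": "def solution(...): ..."},
#     ]
# }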


def setup_model():
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

    print_banner("LOADING MODEL")
    print(f"🤖 Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f"   Vocab: {len(tokenizer):,} tokens")

    print("\n⚡ Configuring QLoRA (4-bit NF4 + double quantization)...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    print("📥 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)
    print("✅ Model prepared")

    print(f"\n🔧 Configuring LoRA (r={TRAINING_CONFIG['lora_r']}, alpha={TRAINING_CONFIG['lora_alpha']})...")
    lora_config = LoraConfig(
        r=TRAINING_CONFIG["lora_r"],
        lora_alpha=TRAINING_CONFIG["lora_alpha"],
        lora_dropout=TRAINING_CONFIG["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
        # Adapt all attention and MLP projections of the Qwen2 blocks.
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        # Also fully train the embeddings and output head (increases VRAM use
        # and checkpoint size).
        modules_to_save=["lm_head", "embed_tokens"],
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model, tokenizer, lora_config
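

# Optional VRAM sanity check (a minimal sketch added for illustration; it is
# not called anywhere in the training flow). get_memory_footprint() is a
# standard transformers model method that reports parameter/buffer memory
# only, not activations or optimizer state, so treat the number as a lower bound.
def _report_memory(model):
    """Print the loaded model's parameter memory in GB (rough guide only)."""
    footprint_gb = model.get_memory_footprint() / 1024**3
    print(f"   Model parameter memory: {footprint_gb:.2f} GB (excludes activations/optimizer)")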


def create_trainer(model, tokenizer, train_ds, eval_ds, lora_config):
    from trl import SFTTrainer, SFTConfig

    print_banner("CONFIGURING TRAINER")
    run_name = f"code-qwen-{datetime.now().strftime('%m%d-%H%M')}"

    report_to = []
    try:
        import trackio
        trackio.init(project="code-llm", experiment="qlora-sft", run_name=run_name)
        report_to = ["trackio"]
        print("✅ Trackio monitoring enabled")
    except Exception:
        print("⚠️ Trackio unavailable, falling back to tensorboard")
        report_to = ["tensorboard"]

    training_args = SFTConfig(
        learning_rate=TRAINING_CONFIG["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_ratio=TRAINING_CONFIG["warmup_ratio"],
        num_train_epochs=TRAINING_CONFIG["num_epochs"],
        per_device_train_batch_size=TRAINING_CONFIG["batch_size"],
        gradient_accumulation_steps=TRAINING_CONFIG["gradient_accumulation"],
        max_seq_length=TRAINING_CONFIG["max_seq_length"],
        gradient_checkpointing=True,
        bf16=True,
        fp16=False,
        optim="paged_adamw_8bit",
        packing=True,
        output_dir="./output_code",
        logging_steps=10,
        save_steps=1000,
        save_total_limit=2,
        eval_strategy="steps",
        eval_steps=1000,
        push_to_hub=True,
        hub_model_id=OUTPUT_DIR,
        hub_strategy="checkpoint",
        report_to=report_to,
        logging_strategy="steps",
        logging_first_step=True,
        remove_unused_columns=False,
        dataloader_num_workers=4,
        seed=42,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        processing_class=tokenizer,
        peft_config=lora_config,
    )

    # Rough estimate only: sequence packing reduces the real number of steps.
    total_steps = (
        len(train_ds)
        // (TRAINING_CONFIG["batch_size"] * TRAINING_CONFIG["gradient_accumulation"])
        * TRAINING_CONFIG["num_epochs"]
    )

    print("\n📋 Training plan:")
    print(f"   Samples:     {len(train_ds):,}")
    print(f"   Batch size:  {TRAINING_CONFIG['batch_size']} × {TRAINING_CONFIG['gradient_accumulation']} = {TRAINING_CONFIG['batch_size'] * TRAINING_CONFIG['gradient_accumulation']}")
    print(f"   Epochs:      {TRAINING_CONFIG['num_epochs']}")
    print(f"   Est. steps:  ~{total_steps:,}")
    print("   Packing:     ✅ enabled")
    print("   Optimizer:   paged_adamw_8bit")
    print(f"   Output:      https://huggingface.co/{OUTPUT_DIR}")
    return trainer, run_name


def train(trainer):
    print_banner("TRAINING")
    print("🚀 Starting training...\n   Press Ctrl+C at any time to interrupt and save\n")
    try:
        result = trainer.train()
        print(f"\n✅ Training complete! Steps: {result.global_step}, Loss: {result.training_loss:.4f}")
        return True
    except KeyboardInterrupt:
        print("\n⚠️ Training interrupted, saving...")
        trainer.save_model()
        return True
    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        raise


def save_and_push(trainer):
    print_banner("SAVING & UPLOADING")
    try:
        print("📤 Uploading model to HuggingFace Hub...")
        trainer.push_to_hub()
        print(f"\n✅ Model uploaded!\n🔗 https://huggingface.co/{OUTPUT_DIR}")
    except Exception as e:
        print(f"⚠️ Upload failed: {e}\n   The model is saved locally in ./output_code")


def main():
    print("""
    ╔════════════════════════════════════════════════════════════╗
    ║              Code LLM - QLoRA Fine-tuning                   ║
    ║              Base: Qwen2.5-Coder-3B                         ║
    ║              Data: 250K code samples (3 datasets)           ║
    ╚════════════════════════════════════════════════════════════╝
    """)
    check_environment()
    train_ds, eval_ds = load_datasets()
    model, tokenizer, lora_config = setup_model()
    trainer, run_name = create_trainer(model, tokenizer, train_ds, eval_ds, lora_config)
    success = train(trainer)
    if success:
        save_and_push(trainer)
        print_banner("DONE")
        print(f"  Run: {run_name}\n  Model: https://huggingface.co/{OUTPUT_DIR}")


if __name__ == "__main__":
    main()
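
# ------------------------------------------------------------
# Quick inference smoke test (a minimal sketch, not executed by this script).
# Assumes the LoRA adapter and tokenizer were pushed to the Hub at OUTPUT_DIR;
# adjust the repo id, prompt, and generation settings as needed.
#
#   from transformers import AutoTokenizer
#   from peft import AutoPeftModelForCausalLM
#
#   model = AutoPeftModelForCausalLM.from_pretrained(OUTPUT_DIR, device_map="auto")
#   tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
#   messages = [{"role": "user", "content": "Write a Python function that reverses a string."}]
#   inputs = tokenizer.apply_chat_template(
#       messages, add_generation_prompt=True, return_tensors="pt"
#   ).to(model.device)
#   output = model.generate(inputs, max_new_tokens=256)
#   print(tokenizer.decode(output[0], skip_special_tokens=True))
# ------------------------------------------------------------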