Justin-lee committed · verified
Commit 28bf24b · Parent(s): f4a8336

Add Code LLM training script

Files changed (1):
  1. code_llm_train.py +261 -0
code_llm_train.py ADDED
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Code LLM - QLoRA Fine-tuning Script
====================================
Base Model: Qwen/Qwen2.5-Coder-3B
Method: QLoRA SFT (4-bit NF4 + LoRA r=64)
Datasets: Code-Feedback (66K) + Magicoder-OSS (75K) + Evol-CodeAlpaca (110K) = ~250K

Hardware: RTX 3070 (8GB VRAM) or any GPU >= 8GB
Training time: ~6-8 hours (3 epochs)

Usage:
    pip install -r requirements_code.txt
    python code_llm_train.py
"""

import os
import sys
import torch
from datetime import datetime

# ============================================================
# CONFIGURATION - edit this section
# ============================================================
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
HF_USERNAME = "YOUR_HF_USERNAME"  # change to your HuggingFace username

# Training hyperparameters (tuned for an RTX 3070 with 8GB VRAM)
TRAINING_CONFIG = {
    "learning_rate": 2e-4,
    "num_epochs": 3,
    "batch_size": 1,
    "gradient_accumulation": 16,
    "max_seq_length": 2048,
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "warmup_ratio": 0.05,
}

OUTPUT_DIR = f"{HF_USERNAME}/code-qwen2.5-coder-3b"
# ============================================================
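
# Illustrative arithmetic (an aside; this constant is not used by the script):
# the effective batch size is per-device batch x gradient accumulation,
# i.e. 1 x 16 = 16 packed sequences per optimizer step. With packing at
# max_seq_length=2048 each step sees roughly 16 * 2048 = 32,768 tokens.
EFFECTIVE_BATCH = TRAINING_CONFIG["batch_size"] * TRAINING_CONFIG["gradient_accumulation"]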


def print_banner(text):
    print(f"\n{'='*60}")
    print(f"  {text}")
    print(f"{'='*60}")


def check_environment():
    print_banner("ENVIRONMENT CHECK")
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"✅ GPU: {gpu_name}")
        print(f"   VRAM: {vram:.1f} GB")
        if vram < 7:
            print("⚠️ VRAM < 8GB; training may OOM. Consider lowering max_seq_length to 1024")
    else:
        print("❌ No GPU detected! This script requires an NVIDIA GPU")
        sys.exit(1)

    required = ["transformers", "trl", "peft", "bitsandbytes", "accelerate", "datasets"]
    missing = []
    for pkg in required:
        try:
            __import__(pkg)
            print(f"✅ {pkg}")
        except ImportError:
            missing.append(pkg)
            print(f"❌ {pkg}")
    if missing:
        print(f"\nPlease run: pip install {' '.join(missing)}")
        sys.exit(1)


def load_datasets():
    from datasets import load_dataset, concatenate_datasets

    print_banner("LOADING DATASETS")

    print("📦 [1/3] Code-Feedback (66K multi-turn coding chat)...")
    code_feedback = load_dataset("m-a-p/Code-Feedback", split="train")
    cf_msgs = code_feedback.map(
        lambda x: {"messages": x["messages"]},
        remove_columns=[c for c in code_feedback.column_names if c != "messages"],
    )
    print(f"   ✅ {len(cf_msgs)} samples loaded")

    print("📦 [2/3] Magicoder-OSS-Instruct (75K real GitHub seeds)...")
    magicoder = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")

    def convert_magicoder(example):
        return {"messages": [
            {"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
            {"role": "user", "content": example["problem"]},
            {"role": "assistant", "content": example["solution"]},
        ]}

    mc_msgs = magicoder.map(convert_magicoder, remove_columns=magicoder.column_names)
    print(f"   ✅ {len(mc_msgs)} samples converted")

    print("📦 [3/3] Evol-CodeAlpaca (110K complexity-evolved)...")
    evol = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")

    def convert_evol(example):
        return {"messages": [
            {"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["output"]},
        ]}

    evol_msgs = evol.map(convert_evol, remove_columns=evol.column_names)
    print(f"   ✅ {len(evol_msgs)} samples converted")

    print("\n🔄 Merging datasets...")
    combined = concatenate_datasets([cf_msgs, mc_msgs, evol_msgs]).shuffle(seed=42)
    split = combined.train_test_split(test_size=0.02, seed=42)
    train_ds, eval_ds = split["train"], split["test"]

    print("\n📊 Dataset statistics:")
    print(f"   Code-Feedback:   {len(cf_msgs):>7,} samples")
    print(f"   Magicoder-OSS:   {len(mc_msgs):>7,} samples")
    print(f"   Evol-CodeAlpaca: {len(evol_msgs):>7,} samples")
    print(f"   {'─'*35}")
    print(f"   Train total:     {len(train_ds):>7,} samples")
    print(f"   Eval total:      {len(eval_ds):>7,} samples")
    return train_ds, eval_ds
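

# Illustrative helper (not called by main): a minimal sketch of how one
# converted sample renders through the model's chat template, assuming the
# tokenizer ships one (Qwen2.5-Coder does). Useful for sanity-checking the
# "messages" format before committing to a multi-hour training run.
def preview_sample(tokenizer, dataset, index=0):
    """Print the chat-template rendering of one training sample."""
    text = tokenizer.apply_chat_template(
        dataset[index]["messages"], tokenize=False, add_generation_prompt=False,
    )
    print(text[:1000])  # the first 1000 characters are usually enough to eyeball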


def setup_model():
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

    print_banner("LOADING MODEL")
    print(f"🤖 Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f"   Vocab: {len(tokenizer):,} tokens")

    print("\n⚡ Configuring QLoRA (4-bit NF4 + double quant)...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
    )

    print("📥 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)
    print("✅ Model ready")

    print(f"\n🔧 Configuring LoRA (r={TRAINING_CONFIG['lora_r']}, alpha={TRAINING_CONFIG['lora_alpha']})...")
    lora_config = LoraConfig(
        r=TRAINING_CONFIG["lora_r"], lora_alpha=TRAINING_CONFIG["lora_alpha"],
        lora_dropout=TRAINING_CONFIG["lora_dropout"], bias="none", task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        modules_to_save=["lm_head", "embed_tokens"],
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model, tokenizer, lora_config
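

# Illustrative arithmetic (not called by main): for each targeted weight
# W of shape (d_out, d_in), LoRA adds A (r x d_in) and B (d_out x r), i.e.
# r * (d_in + d_out) trainable parameters per module. The sketch below
# recomputes the count that print_trainable_parameters() reports, assuming
# the model has already been wrapped by get_peft_model.
def count_trainable(model):
    """Sum trainable parameters and report their share of the total."""
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    return trainable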


def create_trainer(model, tokenizer, train_ds, eval_ds, lora_config):
    from trl import SFTTrainer, SFTConfig

    print_banner("CONFIGURING TRAINER")
    run_name = f"code-qwen-{datetime.now().strftime('%m%d-%H%M')}"

    report_to = []
    try:
        import trackio
        trackio.init(project="code-llm", experiment="qlora-sft", run_name=run_name)
        report_to = ["trackio"]
        print("✅ Trackio monitoring enabled")
    except Exception:
        print("⚠️ Trackio unavailable; falling back to tensorboard")
        report_to = ["tensorboard"]

    training_args = SFTConfig(
        learning_rate=TRAINING_CONFIG["learning_rate"], lr_scheduler_type="cosine",
        warmup_ratio=TRAINING_CONFIG["warmup_ratio"],
        num_train_epochs=TRAINING_CONFIG["num_epochs"],
        per_device_train_batch_size=TRAINING_CONFIG["batch_size"],
        gradient_accumulation_steps=TRAINING_CONFIG["gradient_accumulation"],
        max_seq_length=TRAINING_CONFIG["max_seq_length"],
        gradient_checkpointing=True, bf16=True, fp16=False,
        optim="paged_adamw_8bit", packing=True,
        output_dir="./output_code", logging_steps=10, save_steps=1000, save_total_limit=2,
        eval_strategy="steps", eval_steps=1000,
        push_to_hub=True, hub_model_id=OUTPUT_DIR, hub_strategy="checkpoint",
        report_to=report_to, logging_strategy="steps", logging_first_step=True,
        remove_unused_columns=False, dataloader_num_workers=4, seed=42,
    )

    trainer = SFTTrainer(
        model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds,
        processing_class=tokenizer, peft_config=lora_config,
    )

    total_steps = len(train_ds) // (TRAINING_CONFIG["batch_size"] * TRAINING_CONFIG["gradient_accumulation"]) * TRAINING_CONFIG["num_epochs"]
    print("\n📋 Training plan:")
    print(f"   Samples:    {len(train_ds):,}")
    print(f"   Batch size: {TRAINING_CONFIG['batch_size']} × {TRAINING_CONFIG['gradient_accumulation']} = {TRAINING_CONFIG['batch_size'] * TRAINING_CONFIG['gradient_accumulation']}")
    print(f"   Epochs:     {TRAINING_CONFIG['num_epochs']}")
    print(f"   Estimated steps: ~{total_steps:,} (sample-based; packing changes the actual count)")
    print("   Packing:    ✅ enabled")
    print("   Optimizer:  paged_adamw_8bit")
    print(f"   Output: https://huggingface.co/{OUTPUT_DIR}")
    return trainer, run_name


def train(trainer):
    print_banner("TRAINING")
    print("🚀 Starting training...\n   Press Ctrl+C at any time to interrupt and save\n")
    try:
        result = trainer.train()
        print(f"\n✅ Training complete! Steps: {result.global_step}, Loss: {result.training_loss:.4f}")
        return True
    except KeyboardInterrupt:
        print("\n⚠️ Training interrupted; saving...")
        trainer.save_model()
        return True
    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        raise
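

# Illustrative sketch (not called by main): resuming an interrupted run.
# Trainer.train accepts resume_from_checkpoint; passing True picks up the
# most recent checkpoint-* directory under output_dir ("./output_code" here).
def resume(trainer):
    """Continue training from the latest saved checkpoint."""
    return trainer.train(resume_from_checkpoint=True)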


def save_and_push(trainer):
    print_banner("SAVING & UPLOADING")
    try:
        print("📤 Uploading model to HuggingFace Hub...")
        trainer.push_to_hub()
        print(f"\n✅ Model uploaded!\n🔗 https://huggingface.co/{OUTPUT_DIR}")
    except Exception as e:
        print(f"⚠️ Upload failed: {e}\n   Model saved in the ./output_code directory")
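

# Illustrative sketch (not called by main): loading the pushed adapter for
# inference. Assumes the LoRA adapter repo lives at OUTPUT_DIR on the Hub;
# because modules_to_save includes lm_head and embed_tokens, PEFT restores
# those full weights from the adapter repo as well.
def example_inference(prompt="Write a Python function that reverses a linked list."):
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from peft import PeftModel

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
    )
    model = PeftModel.from_pretrained(base, OUTPUT_DIR)  # attach the trained adapter
    messages = [{"role": "user", "content": prompt}]
    inputs = tok.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt",
    ).to(model.device)
    out = model.generate(inputs, max_new_tokens=512)
    print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))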


def main():
    print("""
    ╔══════════════════════════════════════════════╗
    ║     Code LLM - QLoRA Fine-tuning             ║
    ║     Base: Qwen2.5-Coder-3B                   ║
    ║     Data: 250K code samples (3 datasets)     ║
    ╚══════════════════════════════════════════════╝
    """)
    check_environment()
    train_ds, eval_ds = load_datasets()
    model, tokenizer, lora_config = setup_model()
    trainer, run_name = create_trainer(model, tokenizer, train_ds, eval_ds, lora_config)
    success = train(trainer)
    if success:
        save_and_push(trainer)
        print_banner("DONE")
        print(f"  Run: {run_name}\n  Model: https://huggingface.co/{OUTPUT_DIR}")


if __name__ == "__main__":
    main()