#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Code LLM - QLoRA Fine-tuning Script
====================================
Base Model: Qwen/Qwen2.5-Coder-3B
Method: QLoRA SFT (4-bit NF4 + LoRA r=64)
Datasets: Code-Feedback (66K) + Magicoder-OSS (75K) + Evol-CodeAlpaca (110K) = ~250K
Hardware: RTX 3070 (8GB VRAM) or any GPU >= 8GB
Training time: ~6-8 hours (3 epochs)
Usage:
pip install -r requirements_code.txt
python code_llm_train.py
"""
import os
import sys
import torch
from datetime import datetime
# ============================================================
# CONFIGURATION - edit the settings below
# ============================================================
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
HF_USERNAME = "YOUR_HF_USERNAME"  # change to your HuggingFace username
# Training hyperparameters (tuned for an RTX 3070 with 8 GB of VRAM)
TRAINING_CONFIG = {
"learning_rate": 2e-4,
"num_epochs": 3,
"batch_size": 1,
"gradient_accumulation": 16,
"max_seq_length": 2048,
"lora_r": 64,
"lora_alpha": 128,
"lora_dropout": 0.05,
"warmup_ratio": 0.05,
}
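# Effective batch size is batch_size * gradient_accumulation = 1 * 16 = 16 sequences
# per optimizer step; with packing enabled each sequence holds up to max_seq_length
# (2048) tokens, i.e. roughly 32K tokens per step.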
OUTPUT_DIR = f"{HF_USERNAME}/code-qwen2.5-coder-3b"
# ============================================================
def print_banner(text):
print(f"\n{'='*60}")
print(f" {text}")
print(f"{'='*60}")
def check_environment():
print_banner("ENVIRONMENT CHECK")
if torch.cuda.is_available():
gpu_name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"✅ GPU: {gpu_name}")
        print(f"   VRAM: {vram:.1f} GB")
        if vram < 7:
            print("⚠️ Less than the recommended 8 GB of VRAM detected; training may OOM. Consider lowering max_seq_length to 1024.")
    else:
        print("❌ No GPU detected. This script requires an NVIDIA GPU.")
        sys.exit(1)
required = ["transformers", "trl", "peft", "bitsandbytes", "accelerate", "datasets"]
missing = []
for pkg in required:
try:
__import__(pkg)
print(f"โœ… {pkg}")
except ImportError:
missing.append(pkg)
print(f"โŒ {pkg}")
if missing:
print(f"\n่ซ‹้‹่กŒ: pip install {' '.join(missing)}")
sys.exit(1)
def load_datasets():
from datasets import load_dataset, concatenate_datasets
print_banner("LOADING DATASETS")
print("๐Ÿ“ฆ [1/3] Code-Feedback (66K multi-turn coding chat)...")
code_feedback = load_dataset("m-a-p/Code-Feedback", split="train")
cf_msgs = code_feedback.map(
lambda x: {"messages": x["messages"]},
remove_columns=[c for c in code_feedback.column_names if c != "messages"],
)
print(f" โœ… {len(cf_msgs)} samples loaded")
print("๐Ÿ“ฆ [2/3] Magicoder-OSS-Instruct (75K real GitHub seeds)...")
magicoder = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")
def convert_magicoder(example):
return {"messages": [
{"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
{"role": "user", "content": example["problem"]},
{"role": "assistant", "content": example["solution"]},
]}
mc_msgs = magicoder.map(convert_magicoder, remove_columns=magicoder.column_names)
print(f" โœ… {len(mc_msgs)} samples converted")
print("๐Ÿ“ฆ [3/3] Evol-CodeAlpaca (110K complexity-evolved)...")
evol = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")
def convert_evol(example):
return {"messages": [
{"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
{"role": "user", "content": example["instruction"]},
{"role": "assistant", "content": example["output"]},
]}
evol_msgs = evol.map(convert_evol, remove_columns=evol.column_names)
print(f" โœ… {len(evol_msgs)} samples converted")
print("\n๐Ÿ”„ ๅˆไฝตๆ•ธๆ“š้›†...")
combined = concatenate_datasets([cf_msgs, mc_msgs, evol_msgs]).shuffle(seed=42)
split = combined.train_test_split(test_size=0.02, seed=42)
train_ds, eval_ds = split["train"], split["test"]
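    # test_size=0.02 holds out ~2% of the ~250K merged samples (about 5K) for evaluation.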
print(f"\n๐Ÿ“Š ๆ•ธๆ“š้›†็ตฑ่จˆ:")
print(f" Code-Feedback: {len(cf_msgs):>7,} samples")
print(f" Magicoder-OSS: {len(mc_msgs):>7,} samples")
print(f" Evol-CodeAlpaca:{len(evol_msgs):>7,} samples")
print(f" {'โ”€'*35}")
print(f" ็ธฝ่จˆ่จ“็ทด: {len(train_ds):>7,} samples")
print(f" ็ธฝ่จˆ้ฉ—่ญ‰: {len(eval_ds):>7,} samples")
return train_ds, eval_ds
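# Data format note: all three sources above are normalized to the chat schema that
# TRL's SFTTrainer consumes - a "messages" column holding a list of role/content
# dicts. An illustrative (made-up) sample looks like:
#   {"messages": [
#       {"role": "system", "content": "You are an exceptionally skilled programmer. ..."},
#       {"role": "user", "content": "Write a function that reverses a string."},
#       {"role": "assistant", "content": "def reverse(s):\n    return s[::-1]"},
#   ]}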
def setup_model():
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
print_banner("LOADING MODEL")
print(f"๐Ÿค– Model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print(f" Vocab: {len(tokenizer):,} tokens")
print("\nโšก ้…็ฝฎ QLoRA (4-bit NF4 + double quant)...")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
)
print("๐Ÿ“ฅ ่ผ‰ๅ…ฅๆจกๅž‹...")
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)
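    # prepare_model_for_kbit_training upcasts layer norms to fp32 and enables input
    # gradients so that gradient checkpointing works on the quantized base model.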
print("โœ… ๆจกๅž‹ๆบ–ๅ‚™ๅฎŒๆˆ")
print(f"\n๐Ÿ”ง ้…็ฝฎ LoRA (r={TRAINING_CONFIG['lora_r']}, alpha={TRAINING_CONFIG['lora_alpha']})...")
lora_config = LoraConfig(
r=TRAINING_CONFIG["lora_r"], lora_alpha=TRAINING_CONFIG["lora_alpha"],
lora_dropout=TRAINING_CONFIG["lora_dropout"], bias="none", task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
modules_to_save=["lm_head", "embed_tokens"],
)
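    # Note: modules_to_save keeps full, trainable copies of lm_head and embed_tokens
    # alongside the LoRA adapters, which noticeably increases both VRAM use and the
    # size of the saved adapter; drop it if memory is tight.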
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
return model, tokenizer, lora_config
def create_trainer(model, tokenizer, train_ds, eval_ds, lora_config):
from trl import SFTTrainer, SFTConfig
print_banner("CONFIGURING TRAINER")
run_name = f"code-qwen-{datetime.now().strftime('%m%d-%H%M')}"
report_to = []
try:
import trackio
trackio.init(project="code-llm", experiment="qlora-sft", run_name=run_name)
report_to = ["trackio"]
print("โœ… Trackio ็›ฃๆŽงๅทฒๅ•Ÿๅ‹•")
except Exception:
print("โš ๏ธ Trackio ไธๅฏ็”จ๏ผŒไฝฟ็”จ tensorboard")
report_to = ["tensorboard"]
training_args = SFTConfig(
learning_rate=TRAINING_CONFIG["learning_rate"], lr_scheduler_type="cosine",
warmup_ratio=TRAINING_CONFIG["warmup_ratio"],
num_train_epochs=TRAINING_CONFIG["num_epochs"],
per_device_train_batch_size=TRAINING_CONFIG["batch_size"],
gradient_accumulation_steps=TRAINING_CONFIG["gradient_accumulation"],
max_seq_length=TRAINING_CONFIG["max_seq_length"],
gradient_checkpointing=True, bf16=True, fp16=False,
optim="paged_adamw_8bit", packing=True,
output_dir="./output_code", logging_steps=10, save_steps=1000, save_total_limit=2,
eval_strategy="steps", eval_steps=1000,
push_to_hub=True, hub_model_id=OUTPUT_DIR, hub_strategy="checkpoint",
report_to=report_to, logging_strategy="steps", logging_first_step=True,
remove_unused_columns=False, dataloader_num_workers=4, seed=42,
)
trainer = SFTTrainer(
model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds,
processing_class=tokenizer, peft_config=lora_config,
)
total_steps = len(train_ds) // (TRAINING_CONFIG["batch_size"] * TRAINING_CONFIG["gradient_accumulation"]) * TRAINING_CONFIG["num_epochs"]
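    # Rough upper bound: with packing=True several short samples are concatenated into
    # each 2048-token sequence, so the actual number of optimizer steps is usually lower.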
print(f"\n๐Ÿ“‹ ่จ“็ทด่จˆๅŠƒ:")
print(f" ๆ•ธๆ“š้‡: {len(train_ds):,} samples")
print(f" Batch size: {TRAINING_CONFIG['batch_size']} ร— {TRAINING_CONFIG['gradient_accumulation']} = {TRAINING_CONFIG['batch_size'] * TRAINING_CONFIG['gradient_accumulation']}")
print(f" Epochs: {TRAINING_CONFIG['num_epochs']}")
print(f" ้ ไผฐๆญฅๆ•ธ: ~{total_steps:,} steps")
print(f" Packing: โœ… ๅ•Ÿ็”จ")
print(f" Optimizer: paged_adamw_8bit")
print(f" ่ผธๅ‡บไฝ็ฝฎ: https://huggingface.co/{OUTPUT_DIR}")
return trainer, run_name
def train(trainer):
print_banner("TRAINING")
print("๐Ÿš€ ้–‹ๅง‹่จ“็ทด...\n ๆŒ‰ Ctrl+C ๅฏ้šจๆ™‚ไธญๆ–ทไธฆไฟๅญ˜\n")
try:
result = trainer.train()
print(f"\nโœ… ่จ“็ทดๅฎŒๆˆ๏ผ Steps: {result.global_step}, Loss: {result.training_loss:.4f}")
return True
except KeyboardInterrupt:
print("\nโš ๏ธ ่จ“็ทด่ขซไธญๆ–ท๏ผŒๆญฃๅœจไฟๅญ˜...")
trainer.save_model()
return True
except Exception as e:
print(f"\nโŒ ่จ“็ทดๅคฑๆ•—: {e}")
raise
def save_and_push(trainer):
print_banner("SAVING & UPLOADING")
try:
print("๐Ÿ“ค ไธŠๅ‚ณๆจกๅž‹ๅˆฐ HuggingFace Hub...")
trainer.push_to_hub()
print(f"\nโœ… ๆจกๅž‹ๅทฒไธŠๅ‚ณ!\n๐Ÿ”— https://huggingface.co/{OUTPUT_DIR}")
except Exception as e:
print(f"โš ๏ธ ไธŠๅ‚ณๅคฑๆ•—: {e}\n ๆจกๅž‹ๅทฒไฟๅญ˜ๅœจ ./output_code ็›ฎ้Œ„")
def main():
print("""
โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
โ•‘ Code LLM - QLoRA Fine-tuning โ•‘
โ•‘ Base: Qwen2.5-Coder-3B โ•‘
โ•‘ Data: 250K code samples (3 datasets) โ•‘
โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
""")
check_environment()
train_ds, eval_ds = load_datasets()
model, tokenizer, lora_config = setup_model()
trainer, run_name = create_trainer(model, tokenizer, train_ds, eval_ds, lora_config)
success = train(trainer)
if success:
save_and_push(trainer)
print_banner("DONE")
print(f" Run: {run_name}\n Model: https://huggingface.co/{OUTPUT_DIR}")
if __name__ == "__main__":
main()
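# ----------------------------------------------------------------------------
# Quick-test sketch (not executed by this script): load the pushed adapter on
# top of the base model. It assumes save_and_push() succeeded and that
# OUTPUT_DIR points at your own account; adjust the repo id and prompt as needed.
#
#   from transformers import AutoTokenizer, AutoModelForCausalLM
#   from peft import PeftModel
#
#   base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", torch_dtype="auto")
#   model = PeftModel.from_pretrained(base, OUTPUT_DIR)
#   tok = AutoTokenizer.from_pretrained(OUTPUT_DIR)
#
#   chat = [{"role": "user", "content": "Write a Python function that checks if a number is prime."}]
#   prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
#   inputs = tok(prompt, return_tensors="pt").to(model.device)
#   out = model.generate(**inputs, max_new_tokens=256)
#   print(tok.decode(out[0], skip_special_tokens=True))
# ----------------------------------------------------------------------------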