Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Code LLM - QLoRA Fine-tuning Script | |
| ==================================== | |
| Base Model: Qwen/Qwen2.5-Coder-3B | |
| Method: QLoRA SFT (4-bit NF4 + LoRA r=64) | |
| Datasets: Code-Feedback (66K) + Magicoder-OSS (75K) + Evol-CodeAlpaca (110K) = ~250K | |
| Hardware: RTX 3070 (8GB VRAM) or any GPU >= 8GB | |
| Training time: ~6-8 hours (3 epochs) | |
| Usage: | |
| pip install -r requirements_code.txt | |
| python code_llm_train.py | |
| """ | |
| import os | |
| import sys | |
| import torch | |
| from datetime import datetime | |
# ============================================================
# CONFIGURATION - edit the values in this section
# ============================================================
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
HF_USERNAME = "YOUR_HF_USERNAME"  # replace with your Hugging Face username

# Training hyperparameters (chosen for an RTX 3070 with 8 GB of VRAM).
TRAINING_CONFIG = dict(
    learning_rate=2e-4,
    num_epochs=3,
    batch_size=1,
    gradient_accumulation=16,
    max_seq_length=2048,
    lora_r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    warmup_ratio=0.05,
)

# Hub repository id ("<username>/<repo>"); used as ``hub_model_id`` and in
# the model URLs printed by the script.
OUTPUT_DIR = HF_USERNAME + "/code-qwen2.5-coder-3b"
# ============================================================
def print_banner(text):
    """Print ``text`` framed by two 60-character ``=`` rules.

    Output is a blank line, a rule, the text prefixed by one space, and a
    closing rule.

    Args:
        text: Title to show between the rules.
    """
    rule = "=" * 60
    print("\n" + rule, f" {text}", rule, sep="\n")
def check_environment():
    """Check that a CUDA GPU and every required package are available.

    Prints the name and total memory (in GiB) of CUDA device 0, warns when
    fewer than 7 GiB are reported, then tries to import each package the
    script depends on and prints one line per package.

    Raises:
        SystemExit: With status 1 when no CUDA device is available, or when
            at least one required package cannot be imported.
    """
    import importlib

    print_banner("ENVIRONMENT CHECK")

    # NOTE(review): the non-ASCII text in the messages below looks
    # mis-encoded (mojibake). It is kept verbatim here; confirm against the
    # original file before changing any of it.
    if not torch.cuda.is_available():
        print("โ ๆฒๆๅตๆธฌๅฐ GPU๏ผๆญค่ ณๆฌ้่ฆ NVIDIA GPU")
        sys.exit(1)

    gpu_name = torch.cuda.get_device_name(0)
    # The device-properties attribute is ``total_memory`` (in bytes);
    # ``total_mem`` does not exist and raised AttributeError on every GPU.
    vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"โ GPU: {gpu_name}")
    print(f" VRAM: {vram:.1f} GB")
    if vram < 7:
        print("โ ๏ธ VRAM < 8GB, ๅฏ่ฝๆ OOM๏ผๅปบ่ญฐ้ไฝ max_seq_length ๅฐ 1024")

    required = ["transformers", "trl", "peft", "bitsandbytes", "accelerate", "datasets"]
    missing = []
    for pkg in required:
        try:
            importlib.import_module(pkg)
        except ImportError:
            missing.append(pkg)
            print(f"โ {pkg}")
        else:
            print(f"โ {pkg}")

    if missing:
        print(f"\n่ซ้่ก: pip install {' '.join(missing)}")
        sys.exit(1)
def load_datasets():
    """Download the three source datasets, merge them and split train/eval.

    Code-Feedback is reduced to its existing ``messages`` column. The
    Magicoder and Evol-CodeAlpaca records are converted to a three-message
    chat (system, user, assistant) stored under ``messages``. The three
    sets are concatenated, shuffled with seed 42 and split with
    ``test_size=0.02``.

    Returns:
        tuple: ``(train_ds, eval_ds)``, the train and test parts of the split.
    """
    from datasets import load_dataset, concatenate_datasets

    system_prompt = "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."

    def chat_converter(prompt_key, answer_key):
        """Return a map function that builds a chat from two record fields."""
        def convert(example):
            return {"messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example[prompt_key]},
                {"role": "assistant", "content": example[answer_key]},
            ]}
        return convert

    def keep_messages(example):
        """Pass the ``messages`` field through unchanged."""
        return {"messages": example["messages"]}

    print_banner("LOADING DATASETS")

    # [1/3] Already in chat format: drop every column except ``messages``.
    print("๐ฆ [1/3] Code-Feedback (66K multi-turn coding chat)...")
    code_feedback = load_dataset("m-a-p/Code-Feedback", split="train")
    extra_columns = [name for name in code_feedback.column_names if name != "messages"]
    cf_msgs = code_feedback.map(keep_messages, remove_columns=extra_columns)
    print(f" โ {len(cf_msgs)} samples loaded")

    # [2/3] problem/solution pairs.
    print("๐ฆ [2/3] Magicoder-OSS-Instruct (75K real GitHub seeds)...")
    magicoder = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")
    mc_msgs = magicoder.map(
        chat_converter("problem", "solution"),
        remove_columns=magicoder.column_names,
    )
    print(f" โ {len(mc_msgs)} samples converted")

    # [3/3] instruction/output pairs.
    print("๐ฆ [3/3] Evol-CodeAlpaca (110K complexity-evolved)...")
    evol = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")
    evol_msgs = evol.map(
        chat_converter("instruction", "output"),
        remove_columns=evol.column_names,
    )
    print(f" โ {len(evol_msgs)} samples converted")

    print("\n๐ ๅไฝตๆธๆ้...")
    merged = concatenate_datasets([cf_msgs, mc_msgs, evol_msgs])
    combined = merged.shuffle(seed=42)
    parts = combined.train_test_split(test_size=0.02, seed=42)
    train_ds = parts["train"]
    eval_ds = parts["test"]

    print("\n๐ ๆธๆ้็ตฑ่จ:")
    print(f" Code-Feedback: {len(cf_msgs):>7,} samples")
    print(f" Magicoder-OSS: {len(mc_msgs):>7,} samples")
    print(f" Evol-CodeAlpaca:{len(evol_msgs):>7,} samples")
    print(f" {'โ'*35}")
    print(f" ็ธฝ่จ่จ็ทด: {len(train_ds):>7,} samples")
    print(f" ็ธฝ่จ้ฉ่ญ: {len(eval_ds):>7,} samples")
    return train_ds, eval_ds
def setup_model():
    """Load the tokenizer and the 4-bit quantized model, then attach LoRA.

    The tokenizer falls back to the EOS token when it has no pad token and
    pads on the right. The model is loaded with a 4-bit NF4
    ``BitsAndBytesConfig`` (bfloat16 compute, double quantization), passed
    through ``prepare_model_for_kbit_training`` and wrapped with
    ``get_peft_model``.

    Returns:
        tuple: ``(model, tokenizer, lora_config)``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

    print_banner("LOADING MODEL")
    print(f"๐ค Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        # No dedicated pad token: reuse EOS so padding is possible.
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f" Vocab: {len(tokenizer):,} tokens")

    print("\nโก ้ ็ฝฎ QLoRA (4-bit NF4 + double quant)...")
    quant_kwargs = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
    }
    bnb_config = BitsAndBytesConfig(**quant_kwargs)

    print("๐ฅ ่ผๅ ฅๆจกๅ...")
    # NOTE(review): trust_remote_code=True allows repository code to run;
    # confirm it is actually needed for this checkpoint.
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    prepared_model = prepare_model_for_kbit_training(base_model)
    print("โ ๆจกๅๆบๅๅฎๆ")

    rank = TRAINING_CONFIG["lora_r"]
    alpha = TRAINING_CONFIG["lora_alpha"]
    print(f"\n๐ง ้ ็ฝฎ LoRA (r={rank}, alpha={alpha})...")
    # NOTE(review): modules_to_save presumably makes lm_head and
    # embed_tokens fully trainable, which may be heavy on an 8 GB GPU;
    # confirm this is intended.
    lora_config = LoraConfig(
        r=rank,
        lora_alpha=alpha,
        lora_dropout=TRAINING_CONFIG["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        modules_to_save=["lm_head", "embed_tokens"],
    )
    model = get_peft_model(prepared_model, lora_config)
    model.print_trainable_parameters()
    return model, tokenizer, lora_config
def create_trainer(model, tokenizer, train_ds, eval_ds, lora_config):
    """Build the ``SFTTrainer`` and print a summary of the training plan.

    Tries to initialise Trackio for experiment tracking and falls back to
    TensorBoard when that raises any exception.

    Args:
        model: Model to train.
        tokenizer: Tokenizer, passed to the trainer as ``processing_class``.
        train_ds: Training dataset.
        eval_ds: Evaluation dataset.
        lora_config: LoRA configuration, passed to the trainer as
            ``peft_config``.

    Returns:
        tuple: ``(trainer, run_name)``; ``run_name`` is a timestamped label
        of the form ``code-qwen-MMDD-HHMM``.
    """
    from trl import SFTTrainer, SFTConfig

    print_banner("CONFIGURING TRAINER")
    cfg = TRAINING_CONFIG
    stamp = datetime.now().strftime('%m%d-%H%M')
    run_name = f"code-qwen-{stamp}"

    # Best-effort tracking setup: any failure switches to TensorBoard.
    # NOTE(review): confirm that trackio.init accepts the ``experiment`` and
    # ``run_name`` keywords; a TypeError here would be silently swallowed
    # and always select the fallback.
    try:
        import trackio
        trackio.init(project="code-llm", experiment="qlora-sft", run_name=run_name)
        report_to = ["trackio"]
        print("โ Trackio ็ฃๆงๅทฒๅๅ")
    except Exception:
        print("โ ๏ธ Trackio ไธๅฏ็จ๏ผไฝฟ็จ tensorboard")
        report_to = ["tensorboard"]

    training_args = SFTConfig(
        # Optimisation schedule
        learning_rate=cfg["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_ratio=cfg["warmup_ratio"],
        num_train_epochs=cfg["num_epochs"],
        per_device_train_batch_size=cfg["batch_size"],
        gradient_accumulation_steps=cfg["gradient_accumulation"],
        max_seq_length=cfg["max_seq_length"],
        # Memory and precision
        gradient_checkpointing=True,
        bf16=True,
        fp16=False,
        optim="paged_adamw_8bit",
        packing=True,
        # Checkpointing and evaluation
        output_dir="./output_code",
        logging_steps=10,
        save_steps=1000,
        save_total_limit=2,
        eval_strategy="steps",
        eval_steps=1000,
        # Hub upload
        push_to_hub=True,
        hub_model_id=OUTPUT_DIR,
        hub_strategy="checkpoint",
        # Logging and data loading
        report_to=report_to,
        logging_strategy="steps",
        logging_first_step=True,
        remove_unused_columns=False,
        dataloader_num_workers=4,
        seed=42,
    )

    # NOTE(review): setup_model() already wraps the model with
    # get_peft_model, yet peft_config is passed again here; confirm the
    # installed TRL version tolerates that combination.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        processing_class=tokenizer,
        peft_config=lora_config,
    )

    # Rough estimate from the sample count.
    effective_batch = cfg["batch_size"] * cfg["gradient_accumulation"]
    total_steps = len(train_ds) // effective_batch * cfg["num_epochs"]

    print("\n๐ ่จ็ทด่จๅ:")
    print(f" ๆธๆ้: {len(train_ds):,} samples")
    print(f" Batch size: {cfg['batch_size']} ร {cfg['gradient_accumulation']} = {effective_batch}")
    print(f" Epochs: {cfg['num_epochs']}")
    print(f" ้ ไผฐๆญฅๆธ: ~{total_steps:,} steps")
    print(" Packing: โ ๅ็จ")
    print(" Optimizer: paged_adamw_8bit")
    print(f" ่ผธๅบไฝ็ฝฎ: https://huggingface.co/{OUTPUT_DIR}")
    return trainer, run_name
def train(trainer):
    """Run training and report how it ended.

    Args:
        trainer: Object providing ``train()`` and ``save_model()``.

    Returns:
        bool: ``True`` after a normal finish, and also after a Ctrl+C
        interruption (the model is saved first in that case).

    Raises:
        Exception: Any other error raised during training is printed and
            re-raised unchanged.
    """
    print_banner("TRAINING")
    print("๐ ้ๅง่จ็ทด...\n ๆ Ctrl+C ๅฏ้จๆไธญๆทไธฆไฟๅญ\n")
    try:
        result = trainer.train()
        steps = result.global_step
        loss = result.training_loss
        print(f"\nโ ่จ็ทดๅฎๆ๏ผ Steps: {steps}, Loss: {loss:.4f}")
    except KeyboardInterrupt:
        # A manual interruption keeps whatever has been trained so far.
        print("\nโ ๏ธ ่จ็ทด่ขซไธญๆท๏ผๆญฃๅจไฟๅญ...")
        trainer.save_model()
    except Exception as e:
        print(f"\nโ ่จ็ทดๅคฑๆ: {e}")
        raise
    return True
def save_and_push(trainer):
    """Upload the trained model to the Hugging Face Hub (best effort).

    A failed upload is reported on stdout and does not raise.

    Args:
        trainer: Object providing ``push_to_hub()``.
    """
    print_banner("SAVING & UPLOADING")
    hub_url = "https://huggingface.co/" + OUTPUT_DIR
    try:
        print("๐ค ไธๅณๆจกๅๅฐ HuggingFace Hub...")
        trainer.push_to_hub()
        print(f"\nโ ๆจกๅๅทฒไธๅณ!\n๐ {hub_url}")
    except Exception as e:
        # Deliberately non-fatal: the message points at the local output dir.
        print(f"โ ๏ธ ไธๅณๅคฑๆ: {e}\n ๆจกๅๅทฒไฟๅญๅจ ./output_code ็ฎ้")
def main():
    """Run the pipeline: checks, data, model, trainer, training, upload."""
    # NOTE(review): the box-drawing characters of this banner look
    # mis-encoded (mojibake); they are reproduced as found.
    border = "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ"
    banner_lines = [
        "",
        border,
        "โ Code LLM - QLoRA Fine-tuning โ",
        "โ Base: Qwen2.5-Coder-3B โ",
        "โ Data: 250K code samples (3 datasets) โ",
        border,
        "",
    ]
    print("\n".join(banner_lines))

    check_environment()
    train_ds, eval_ds = load_datasets()
    model, tokenizer, lora_config = setup_model()
    trainer, run_name = create_trainer(model, tokenizer, train_ds, eval_ds, lora_config)

    if train(trainer):
        save_and_push(trainer)

    print_banner("DONE")
    print(f" Run: {run_name}\n Model: https://huggingface.co/{OUTPUT_DIR}")


if __name__ == "__main__":
    main()