Justin-lee committed · verified
Commit 28bf24b · Parent(s): f4a8336

Add Code LLM training script

Files changed (1):
  1. code_llm_train.py +261 -0
code_llm_train.py ADDED
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Code LLM - QLoRA Fine-tuning Script
====================================
Base Model: Qwen/Qwen2.5-Coder-3B
Method: QLoRA SFT (4-bit NF4 + LoRA r=64)
Datasets: Code-Feedback (66K) + Magicoder-OSS (75K) + Evol-CodeAlpaca (110K) = ~250K

Hardware: RTX 3070 (8GB VRAM) or any GPU >= 8GB
Training time: ~6-8 hours (3 epochs)

Usage:
    pip install -r requirements_code.txt
    python code_llm_train.py
"""

import os
import sys
import torch
from datetime import datetime

# ============================================================
# CONFIGURATION - edit this section
# ============================================================
MODEL_NAME = "Qwen/Qwen2.5-Coder-3B"
HF_USERNAME = "YOUR_HF_USERNAME"  # change to your HuggingFace username

# Training hyperparameters (tuned for an RTX 3070 with 8GB VRAM)
TRAINING_CONFIG = {
    "learning_rate": 2e-4,
    "num_epochs": 3,
    "batch_size": 1,
    "gradient_accumulation": 16,
    "max_seq_length": 2048,
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "warmup_ratio": 0.05,
}

OUTPUT_DIR = f"{HF_USERNAME}/code-qwen2.5-coder-3b"
# ============================================================
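
# Illustrative arithmetic (an aside; this constant is not used by the script):
# the effective batch size is per-device batch x gradient accumulation,
# i.e. 1 x 16 = 16 packed sequences per optimizer step. With packing at
# max_seq_length=2048 each step sees roughly 16 * 2048 = 32,768 tokens.
EFFECTIVE_BATCH = TRAINING_CONFIG["batch_size"] * TRAINING_CONFIG["gradient_accumulation"]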


def print_banner(text):
    print(f"\n{'='*60}")
    print(f"  {text}")
    print(f"{'='*60}")


def check_environment():
    print_banner("ENVIRONMENT CHECK")
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"✅ GPU: {gpu_name}")
        print(f"   VRAM: {vram:.1f} GB")
        if vram < 7:
            print("⚠️ VRAM < 8GB; training may OOM. Consider lowering max_seq_length to 1024")
    else:
        print("❌ No GPU detected! This script requires an NVIDIA GPU")
        sys.exit(1)

    required = ["transformers", "trl", "peft", "bitsandbytes", "accelerate", "datasets"]
    missing = []
    for pkg in required:
        try:
            __import__(pkg)
            print(f"✅ {pkg}")
        except ImportError:
            missing.append(pkg)
            print(f"❌ {pkg}")
    if missing:
        print(f"\nPlease run: pip install {' '.join(missing)}")
        sys.exit(1)


def load_datasets():
    from datasets import load_dataset, concatenate_datasets

    print_banner("LOADING DATASETS")

    print("📦 [1/3] Code-Feedback (66K multi-turn coding chat)...")
    code_feedback = load_dataset("m-a-p/Code-Feedback", split="train")
    cf_msgs = code_feedback.map(
        lambda x: {"messages": x["messages"]},
        remove_columns=[c for c in code_feedback.column_names if c != "messages"],
    )
    print(f"   ✅ {len(cf_msgs)} samples loaded")

    print("📦 [2/3] Magicoder-OSS-Instruct (75K real GitHub seeds)...")
    magicoder = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")

    def convert_magicoder(example):
        return {"messages": [
            {"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
            {"role": "user", "content": example["problem"]},
            {"role": "assistant", "content": example["solution"]},
        ]}

    mc_msgs = magicoder.map(convert_magicoder, remove_columns=magicoder.column_names)
    print(f"   ✅ {len(mc_msgs)} samples converted")

    print("📦 [3/3] Evol-CodeAlpaca (110K complexity-evolved)...")
    evol = load_dataset("theblackcat102/evol-codealpaca-v1", split="train")

    def convert_evol(example):
        return {"messages": [
            {"role": "system", "content": "You are an exceptionally skilled programmer. Write clean, efficient, well-documented code."},
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["output"]},
        ]}

    evol_msgs = evol.map(convert_evol, remove_columns=evol.column_names)
    print(f"   ✅ {len(evol_msgs)} samples converted")

    print("\n🔄 Merging datasets...")
    combined = concatenate_datasets([cf_msgs, mc_msgs, evol_msgs]).shuffle(seed=42)
    split = combined.train_test_split(test_size=0.02, seed=42)
    train_ds, eval_ds = split["train"], split["test"]

    print("\n📊 Dataset statistics:")
    print(f"   Code-Feedback:   {len(cf_msgs):>7,} samples")
    print(f"   Magicoder-OSS:   {len(mc_msgs):>7,} samples")
    print(f"   Evol-CodeAlpaca: {len(evol_msgs):>7,} samples")
    print(f"   {'─'*35}")
    print(f"   Train total:     {len(train_ds):>7,} samples")
    print(f"   Eval total:      {len(eval_ds):>7,} samples")
    return train_ds, eval_ds
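

# Illustrative helper (not called by main): a minimal sketch of how one
# converted sample renders through the model's chat template, assuming the
# tokenizer ships one (Qwen2.5-Coder does). Useful for sanity-checking the
# "messages" format before committing to a multi-hour training run.
def preview_sample(tokenizer, dataset, index=0):
    """Print the chat-template rendering of one training sample."""
    text = tokenizer.apply_chat_template(
        dataset[index]["messages"], tokenize=False, add_generation_prompt=False,
    )
    print(text[:1000])  # the first 1000 characters are usually enough to eyeball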


def setup_model():
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

    print_banner("LOADING MODEL")
    print(f"🤖 Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f"   Vocab: {len(tokenizer):,} tokens")

    print("\n⚡ Configuring QLoRA (4-bit NF4 + double quant)...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
    )

    print("📥 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)
    print("✅ Model ready")

    print(f"\n🔧 Configuring LoRA (r={TRAINING_CONFIG['lora_r']}, alpha={TRAINING_CONFIG['lora_alpha']})...")
    lora_config = LoraConfig(
        r=TRAINING_CONFIG["lora_r"], lora_alpha=TRAINING_CONFIG["lora_alpha"],
        lora_dropout=TRAINING_CONFIG["lora_dropout"], bias="none", task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        modules_to_save=["lm_head", "embed_tokens"],
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model, tokenizer, lora_config
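

# Illustrative arithmetic (not called by main): for each targeted weight
# W of shape (d_out, d_in), LoRA adds A (r x d_in) and B (d_out x r), i.e.
# r * (d_in + d_out) trainable parameters per module. The sketch below
# recomputes the count that print_trainable_parameters() reports, assuming
# the model has already been wrapped by get_peft_model.
def count_trainable(model):
    """Sum trainable parameters and report their share of the total."""
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    return trainable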


def create_trainer(model, tokenizer, train_ds, eval_ds, lora_config):
    from trl import SFTTrainer, SFTConfig

    print_banner("CONFIGURING TRAINER")
    run_name = f"code-qwen-{datetime.now().strftime('%m%d-%H%M')}"

    report_to = []
    try:
        import trackio
        trackio.init(project="code-llm", experiment="qlora-sft", run_name=run_name)
        report_to = ["trackio"]
        print("✅ Trackio monitoring enabled")
    except Exception:
        print("⚠️ Trackio unavailable; falling back to tensorboard")
        report_to = ["tensorboard"]

    training_args = SFTConfig(
        learning_rate=TRAINING_CONFIG["learning_rate"], lr_scheduler_type="cosine",
        warmup_ratio=TRAINING_CONFIG["warmup_ratio"],
        num_train_epochs=TRAINING_CONFIG["num_epochs"],
        per_device_train_batch_size=TRAINING_CONFIG["batch_size"],
        gradient_accumulation_steps=TRAINING_CONFIG["gradient_accumulation"],
        max_seq_length=TRAINING_CONFIG["max_seq_length"],
        gradient_checkpointing=True, bf16=True, fp16=False,
        optim="paged_adamw_8bit", packing=True,
        output_dir="./output_code", logging_steps=10, save_steps=1000, save_total_limit=2,
        eval_strategy="steps", eval_steps=1000,
        push_to_hub=True, hub_model_id=OUTPUT_DIR, hub_strategy="checkpoint",
        report_to=report_to, logging_strategy="steps", logging_first_step=True,
        remove_unused_columns=False, dataloader_num_workers=4, seed=42,
    )

    trainer = SFTTrainer(
        model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds,
        processing_class=tokenizer, peft_config=lora_config,
    )

    total_steps = len(train_ds) // (TRAINING_CONFIG["batch_size"] * TRAINING_CONFIG["gradient_accumulation"]) * TRAINING_CONFIG["num_epochs"]
    print("\n📋 Training plan:")
    print(f"   Samples:    {len(train_ds):,}")
    print(f"   Batch size: {TRAINING_CONFIG['batch_size']} × {TRAINING_CONFIG['gradient_accumulation']} = {TRAINING_CONFIG['batch_size'] * TRAINING_CONFIG['gradient_accumulation']}")
    print(f"   Epochs:     {TRAINING_CONFIG['num_epochs']}")
    print(f"   Estimated steps: ~{total_steps:,} (sample-based; packing changes the actual count)")
    print("   Packing:    ✅ enabled")
    print("   Optimizer:  paged_adamw_8bit")
    print(f"   Output: https://huggingface.co/{OUTPUT_DIR}")
    return trainer, run_name


def train(trainer):
    print_banner("TRAINING")
    print("🚀 Starting training...\n   Press Ctrl+C at any time to interrupt and save\n")
    try:
        result = trainer.train()
        print(f"\n✅ Training complete! Steps: {result.global_step}, Loss: {result.training_loss:.4f}")
        return True
    except KeyboardInterrupt:
        print("\n⚠️ Training interrupted; saving...")
        trainer.save_model()
        return True
    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        raise
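

# Illustrative sketch (not called by main): resuming an interrupted run.
# Trainer.train accepts resume_from_checkpoint; passing True picks up the
# most recent checkpoint-* directory under output_dir ("./output_code" here).
def resume(trainer):
    """Continue training from the latest saved checkpoint."""
    return trainer.train(resume_from_checkpoint=True)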


def save_and_push(trainer):
    print_banner("SAVING & UPLOADING")
    try:
        print("📤 Uploading model to HuggingFace Hub...")
        trainer.push_to_hub()
        print(f"\n✅ Model uploaded!\n🔗 https://huggingface.co/{OUTPUT_DIR}")
    except Exception as e:
        print(f"⚠️ Upload failed: {e}\n   Model saved in the ./output_code directory")
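

# Illustrative sketch (not called by main): loading the pushed adapter for
# inference. Assumes the LoRA adapter repo lives at OUTPUT_DIR on the Hub;
# because modules_to_save includes lm_head and embed_tokens, PEFT restores
# those full weights from the adapter repo as well.
def example_inference(prompt="Write a Python function that reverses a linked list."):
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from peft import PeftModel

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
    )
    model = PeftModel.from_pretrained(base, OUTPUT_DIR)  # attach the trained adapter
    messages = [{"role": "user", "content": prompt}]
    inputs = tok.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt",
    ).to(model.device)
    out = model.generate(inputs, max_new_tokens=512)
    print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))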


def main():
    print("""
    ╔══════════════════════════════════════════════╗
    ║     Code LLM - QLoRA Fine-tuning             ║
    ║     Base: Qwen2.5-Coder-3B                   ║
    ║     Data: 250K code samples (3 datasets)     ║
    ╚══════════════════════════════════════════════╝
    """)
    check_environment()
    train_ds, eval_ds = load_datasets()
    model, tokenizer, lora_config = setup_model()
    trainer, run_name = create_trainer(model, tokenizer, train_ds, eval_ds, lora_config)
    success = train(trainer)
    if success:
        save_and_push(trainer)
        print_banner("DONE")
        print(f"  Run: {run_name}\n  Model: https://huggingface.co/{OUTPUT_DIR}")


if __name__ == "__main__":
    main()