shank commited on
Commit Β·
bdec91d
1
Parent(s): db12eaa
Fix: Removed BitsandBytes
Browse files- training/train_grpo.py +10 -16
training/train_grpo.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
AgentDebuggerEnv — GRPO Training Script
|
| 3 |
-
Model: Qwen2.5-Coder-7B-Instruct (
|
| 4 |
Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL
|
| 5 |
GPU: auto-detected at runtime (A100/H100 → bfloat16+large batch, T4/V100 → float16+small batch)
|
| 6 |
|
|
@@ -50,7 +50,6 @@ if not args.test_local:
|
|
| 50 |
"accelerate==1.0.1",
|
| 51 |
"trl==0.15.2",
|
| 52 |
"peft==0.13.2",
|
| 53 |
-
"bitsandbytes==0.45.5",
|
| 54 |
]
|
| 55 |
print("Installing training dependencies...", flush=True)
|
| 56 |
ret = os.system(
|
|
@@ -67,7 +66,7 @@ if not args.test_local:
|
|
| 67 |
import wandb
|
| 68 |
from datasets import Dataset
|
| 69 |
from transformers import (
|
| 70 |
-
AutoModelForCausalLM, AutoTokenizer,
|
| 71 |
)
|
| 72 |
from peft import get_peft_model, LoraConfig, TaskType
|
| 73 |
from trl import GRPOTrainer, GRPOConfig
|
|
@@ -86,7 +85,7 @@ if not args.test_local:
|
|
| 86 |
f"trl={_pkg_ver('trl')} "
|
| 87 |
f"accelerate={_pkg_ver('accelerate')} "
|
| 88 |
f"peft={_pkg_ver('peft')} "
|
| 89 |
-
f"
|
| 90 |
)
|
| 91 |
|
| 92 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
@@ -291,12 +290,12 @@ if _gpu_vram_gb >= 40: # A100 40GB / A100 80GB
|
|
| 291 |
_num_gen = 8
|
| 292 |
_max_comp = 256
|
| 293 |
_lora_r = 16
|
| 294 |
-
elif _gpu_vram_gb >= 20: # V100 32GB
|
| 295 |
_batch = 1
|
| 296 |
_grad_accum = 8
|
| 297 |
-
_num_gen =
|
| 298 |
-
_max_comp =
|
| 299 |
-
_lora_r =
|
| 300 |
else: # T4 15GB / anything smaller
|
| 301 |
_batch = 1
|
| 302 |
_grad_accum = 8
|
|
@@ -309,20 +308,15 @@ print(f"Training config: batch={_batch} grad_accum={_grad_accum} "
|
|
| 309 |
f"dtype={COMPUTE_DTYPE}")
|
| 310 |
|
| 311 |
# ── Load model ────────────────────────────────────────────────────────────────
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
bnb_4bit_quant_type="nf4",
|
| 316 |
-
bnb_4bit_compute_dtype=COMPUTE_DTYPE,
|
| 317 |
-
bnb_4bit_use_double_quant=True,
|
| 318 |
-
)
|
| 319 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 320 |
tokenizer.pad_token = tokenizer.eos_token
|
| 321 |
tokenizer.padding_side = "left"
|
| 322 |
|
| 323 |
model = AutoModelForCausalLM.from_pretrained(
|
| 324 |
MODEL_NAME,
|
| 325 |
-
quantization_config=bnb_config,
|
| 326 |
device_map="auto",
|
| 327 |
trust_remote_code=True,
|
| 328 |
torch_dtype=COMPUTE_DTYPE,
|
|
|
|
| 1 |
"""
|
| 2 |
AgentDebuggerEnv — GRPO Training Script
|
| 3 |
+
Model: Qwen2.5-Coder-7B-Instruct (float16/bfloat16 + LoRA, no quantization)
|
| 4 |
Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL
|
| 5 |
GPU: auto-detected at runtime (A100/H100 → bfloat16+large batch, T4/V100 → float16+small batch)
|
| 6 |
|
|
|
|
| 50 |
"accelerate==1.0.1",
|
| 51 |
"trl==0.15.2",
|
| 52 |
"peft==0.13.2",
|
|
|
|
| 53 |
]
|
| 54 |
print("Installing training dependencies...", flush=True)
|
| 55 |
ret = os.system(
|
|
|
|
| 66 |
import wandb
|
| 67 |
from datasets import Dataset
|
| 68 |
from transformers import (
|
| 69 |
+
AutoModelForCausalLM, AutoTokenizer, TrainerCallback
|
| 70 |
)
|
| 71 |
from peft import get_peft_model, LoraConfig, TaskType
|
| 72 |
from trl import GRPOTrainer, GRPOConfig
|
|
|
|
| 85 |
f"trl={_pkg_ver('trl')} "
|
| 86 |
f"accelerate={_pkg_ver('accelerate')} "
|
| 87 |
f"peft={_pkg_ver('peft')} "
|
| 88 |
+
f"dtype={COMPUTE_DTYPE}"
|
| 89 |
)
|
| 90 |
|
| 91 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
| 290 |
_num_gen = 8
|
| 291 |
_max_comp = 256
|
| 292 |
_lora_r = 16
|
| 293 |
+
elif _gpu_vram_gb >= 20:  # A10G 24GB / V100 32GB → float16 model ~14GB
|
| 294 |
_batch = 1
|
| 295 |
_grad_accum = 8
|
| 296 |
+
_num_gen = 4
|
| 297 |
+
_max_comp = 192
|
| 298 |
+
_lora_r = 8
|
| 299 |
else: # T4 15GB / anything smaller
|
| 300 |
_batch = 1
|
| 301 |
_grad_accum = 8
|
|
|
|
| 308 |
f"dtype={COMPUTE_DTYPE}")
|
| 309 |
|
| 310 |
# ── Load model ────────────────────────────────────────────────────────────────
|
| 311 |
+
# Load in native float16/bfloat16 β no bitsandbytes needed.
|
| 312 |
+
# A10G (24GB) fits Qwen2.5-7B in float16 (~14GB) with room for LoRA + activations.
|
| 313 |
+
print(f"Loading {MODEL_NAME} in {COMPUTE_DTYPE}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 315 |
tokenizer.pad_token = tokenizer.eos_token
|
| 316 |
tokenizer.padding_side = "left"
|
| 317 |
|
| 318 |
model = AutoModelForCausalLM.from_pretrained(
|
| 319 |
MODEL_NAME,
|
|
|
|
| 320 |
device_map="auto",
|
| 321 |
trust_remote_code=True,
|
| 322 |
torch_dtype=COMPUTE_DTYPE,
|