Upload train_math.py
Browse files - train_math.py +3 -3
train_math.py
CHANGED
|
@@ -18,7 +18,7 @@ LR = 2e-4
|
|
| 18 |
PER_DEVICE_BATCH = 4
|
| 19 |
GRADIENT_ACCUMULATION = 32
|
| 20 |
MAX_SEQ_LENGTH = 4096
|
| 21 |
-
|
| 22 |
LORA_R = 32
|
| 23 |
LORA_ALPHA = 16
|
| 24 |
LORA_DROPOUT = 0.05
|
|
@@ -98,7 +98,7 @@ def main():
|
|
| 98 |
per_device_train_batch_size=PER_DEVICE_BATCH,
|
| 99 |
gradient_accumulation_steps=GRADIENT_ACCUMULATION,
|
| 100 |
learning_rate=LR, bf16=True,
|
| 101 |
-
lr_scheduler_type="cosine",
|
| 102 |
logging_steps=10, save_strategy="epoch", save_total_limit=2,
|
| 103 |
gradient_checkpointing=True, push_to_hub=True, hub_model_id=HUB_MODEL_ID,
|
| 104 |
hub_private_repo=False, report_to="trackio", disable_tqdm=True,
|
|
@@ -107,7 +107,7 @@ def main():
|
|
| 107 |
|
| 108 |
print("Initializing SFTTrainer...")
|
| 109 |
trainer = SFTTrainer(
|
| 110 |
-
model=model,
|
| 111 |
peft_config=peft_config, args=training_args,
|
| 112 |
max_seq_length=MAX_SEQ_LENGTH,
|
| 113 |
)
|
|
|
|
| 18 |
PER_DEVICE_BATCH = 4
|
| 19 |
GRADIENT_ACCUMULATION = 32
|
| 20 |
MAX_SEQ_LENGTH = 4096
|
| 21 |
+
WARMUP_STEPS = 500
|
| 22 |
LORA_R = 32
|
| 23 |
LORA_ALPHA = 16
|
| 24 |
LORA_DROPOUT = 0.05
|
|
|
|
| 98 |
per_device_train_batch_size=PER_DEVICE_BATCH,
|
| 99 |
gradient_accumulation_steps=GRADIENT_ACCUMULATION,
|
| 100 |
learning_rate=LR, bf16=True,
|
| 101 |
+
lr_scheduler_type="cosine", warmup_steps=WARMUP_STEPS,
|
| 102 |
logging_steps=10, save_strategy="epoch", save_total_limit=2,
|
| 103 |
gradient_checkpointing=True, push_to_hub=True, hub_model_id=HUB_MODEL_ID,
|
| 104 |
hub_private_repo=False, report_to="trackio", disable_tqdm=True,
|
|
|
|
| 107 |
|
| 108 |
print("Initializing SFTTrainer...")
|
| 109 |
trainer = SFTTrainer(
|
| 110 |
+
model=model, processing_class=tokenizer, train_dataset=train_dataset,
|
| 111 |
peft_config=peft_config, args=training_args,
|
| 112 |
max_seq_length=MAX_SEQ_LENGTH,
|
| 113 |
)
|