Jayant-Kernel committed
Commit: f788873
Parent(s): 354d3fd

update: 500 steps L1 + 300 steps L2, higher lr for 1.5B

Files changed:
- Dockerfile (+1 -1)
- train.py (+5 -5)
Dockerfile
@@ -22,4 +22,4 @@ COPY data/ /app/data/
 COPY train.py .
 COPY evaluate.py .
 
-CMD ["python", "
+CMD ["python", "train.py"]
train.py
@@ -169,7 +169,7 @@ train_dataset = Dataset.from_list([
 ])
 
 print("Starting training...")
-wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
+wandb.init(project=WANDB_PROJECT, name="1.5b-level1-v2")
 
 trainer = GRPOTrainer(
     model=model,
@@ -179,10 +179,10 @@ trainer = GRPOTrainer(
         output_dir="/tmp/deceit-1.5b",
         bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
         fp16=False,
-        max_steps=
+        max_steps=500,
         per_device_train_batch_size=4,
         num_generations=4,
-        learning_rate=
+        learning_rate=1e-5,
         warmup_steps=5,
         logging_steps=1,
         save_steps=50,
@@ -267,7 +267,7 @@ def reward_fn_l2(completions, prompts=None, **kwargs):
 
 # Train Level 2
 print("Starting Level 2 training on 1.5B...")
-wandb.init(project=WANDB_PROJECT, name="1.5b-level2")
+wandb.init(project=WANDB_PROJECT, name="1.5b-level2-v2")
 
 trainer_l2 = GRPOTrainer(
     model=model,
@@ -275,7 +275,7 @@ trainer_l2 = GRPOTrainer(
     reward_funcs=[reward_fn_l2],
     args=GRPOConfig(
         output_dir="/tmp/deceit-1.5b-l2",
-        max_steps=
+        max_steps=300,
         per_device_train_batch_size=4,
         num_generations=4,
         learning_rate=2e-6,
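
Taken together, the train.py hunks define a two-stage schedule: 500 GRPO steps on the Level 1 reward at the raised learning rate of 1e-5, then 300 steps on the Level 2 reward at 2e-6. Below is a minimal, self-contained sketch of that shape, assuming TRL's GRPOTrainer/GRPOConfig as used in the diff; the model id, the WANDB_PROJECT value, the dataset rows, the reward logic, and the make_config helper are illustrative placeholders, not taken from this repo.

import torch
import wandb
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer

WANDB_PROJECT = "deceit"  # placeholder; the real value is defined earlier in train.py
MODEL = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder 1.5B model, not from the diff

# Placeholder prompts; the real train_dataset is built earlier in train.py.
train_dataset = Dataset.from_list([{"prompt": "2+2="}, {"prompt": "3+5="}])

def reward_fn_l1(completions, prompts=None, **kwargs):
    # Toy Level 1 reward: longer completions score higher. Placeholder logic.
    return [min(len(c) / 100.0, 1.0) for c in completions]

def reward_fn_l2(completions, prompts=None, **kwargs):
    # Toy Level 2 reward, same TRL signature as in the diff. Placeholder logic.
    return [1.0 if "because" in c else 0.0 for c in completions]

def make_config(output_dir, max_steps, learning_rate):
    # Illustrative helper: the fields shared by both hunks, with only the
    # stage-specific steps and learning rate left as parameters.
    return GRPOConfig(
        output_dir=output_dir,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fp16=False,
        max_steps=max_steps,
        per_device_train_batch_size=4,
        num_generations=4,
        learning_rate=learning_rate,
        warmup_steps=5,
        logging_steps=1,
        save_steps=50,
    )

# Stage 1: 500 steps on the Level 1 reward at 1e-5.
wandb.init(project=WANDB_PROJECT, name="1.5b-level1-v2")
trainer = GRPOTrainer(
    model=MODEL,
    reward_funcs=[reward_fn_l1],
    args=make_config("/tmp/deceit-1.5b", 500, 1e-5),
    train_dataset=train_dataset,
)
trainer.train()
wandb.finish()  # close the Level 1 run before starting the Level 2 run

# Stage 2: 300 more steps on the Level 2 reward at the gentler 2e-6.
wandb.init(project=WANDB_PROJECT, name="1.5b-level2-v2")
trainer_l2 = GRPOTrainer(
    model=MODEL,
    reward_funcs=[reward_fn_l2],
    args=make_config("/tmp/deceit-1.5b-l2", 300, 2e-6),
    train_dataset=train_dataset,
)
trainer_l2.train()

Factoring the shared fields into one helper keeps the two stages in sync; per the commit message, only max_steps and learning_rate differ between Level 1 and Level 2.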