Jayant-Kernel committed on
Commit
f788873
·
1 Parent(s): 354d3fd

update: 500 steps L1 + 300 steps L2, higher lr for 1.5B

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. train.py +5 -5
Dockerfile CHANGED
@@ -22,4 +22,4 @@ COPY data/ /app/data/
22
  COPY train.py .
23
  COPY evaluate.py .
24
 
25
- CMD ["python", "evaluate.py"]
 
22
  COPY train.py .
23
  COPY evaluate.py .
24
 
25
+ CMD ["python", "train.py"]
train.py CHANGED
@@ -169,7 +169,7 @@ train_dataset = Dataset.from_list([
169
  ])
170
 
171
  print("Starting training...")
172
- wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
173
 
174
  trainer = GRPOTrainer(
175
  model=model,
@@ -179,10 +179,10 @@ trainer = GRPOTrainer(
179
  output_dir="/tmp/deceit-1.5b",
180
  bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
181
  fp16=False,
182
- max_steps=150,
183
  per_device_train_batch_size=4,
184
  num_generations=4,
185
- learning_rate=5e-6,
186
  warmup_steps=5,
187
  logging_steps=1,
188
  save_steps=50,
@@ -267,7 +267,7 @@ def reward_fn_l2(completions, prompts=None, **kwargs):
267
 
268
  # Train Level 2
269
  print("Starting Level 2 training on 1.5B...")
270
- wandb.init(project=WANDB_PROJECT, name="1.5b-level2")
271
 
272
  trainer_l2 = GRPOTrainer(
273
  model=model,
@@ -275,7 +275,7 @@ trainer_l2 = GRPOTrainer(
275
  reward_funcs=[reward_fn_l2],
276
  args=GRPOConfig(
277
  output_dir="/tmp/deceit-1.5b-l2",
278
- max_steps=80,
279
  per_device_train_batch_size=4,
280
  num_generations=4,
281
  learning_rate=2e-6,
 
169
  ])
170
 
171
  print("Starting training...")
172
+ wandb.init(project=WANDB_PROJECT, name="1.5b-level1-v2")
173
 
174
  trainer = GRPOTrainer(
175
  model=model,
 
179
  output_dir="/tmp/deceit-1.5b",
180
  bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
181
  fp16=False,
182
+ max_steps=500,
183
  per_device_train_batch_size=4,
184
  num_generations=4,
185
+ learning_rate=1e-5,
186
  warmup_steps=5,
187
  logging_steps=1,
188
  save_steps=50,
 
267
 
268
  # Train Level 2
269
  print("Starting Level 2 training on 1.5B...")
270
+ wandb.init(project=WANDB_PROJECT, name="1.5b-level2-v2")
271
 
272
  trainer_l2 = GRPOTrainer(
273
  model=model,
 
275
  reward_funcs=[reward_fn_l2],
276
  args=GRPOConfig(
277
  output_dir="/tmp/deceit-1.5b-l2",
278
+ max_steps=300,
279
  per_device_train_batch_size=4,
280
  num_generations=4,
281
  learning_rate=2e-6,