Jayant-Kernel committed on
Commit
8e853cb
·
unverified ·
1 Parent(s): 76117fc

upgrade: Qwen 1.5B model, 150 L1 + 80 L2 steps

Browse files
Files changed (1) hide show
  1. train.py +6 -6
train.py CHANGED
@@ -39,8 +39,8 @@ login(token=os.environ["HF_TOKEN"])
39
  wandb.login(key=os.environ["WANDB_API_KEY"])
40
 
41
  # Config
42
- MODEL_NAME = "unsloth/Qwen2.5-0.5B-Instruct"
43
- HF_REPO_ID = "Ajsaxena/deceit-qwen-0.5b-full"
44
  WANDB_PROJECT = "deceit-full"
45
  LORA_RANK = 16
46
 
@@ -168,7 +168,7 @@ train_dataset = Dataset.from_list([
168
  # Train — Level 1 (200 steps)
169
  print("Starting Level 1 training...")
170
  FastLanguageModel.for_training(model)
171
- wandb.init(project=WANDB_PROJECT, name="full-level1")
172
 
173
  trainer = GRPOTrainer(
174
  model=model,
@@ -176,7 +176,7 @@ trainer = GRPOTrainer(
176
  reward_funcs=[reward_fn],
177
  args=GRPOConfig(
178
  output_dir="./deceit-full",
179
- max_steps=200,
180
  per_device_train_batch_size=2,
181
  num_generations=4,
182
  learning_rate=5e-6,
@@ -263,14 +263,14 @@ def reward_fn_l2(completions, prompts=None, **kwargs):
263
 
264
  # Train Level 2 (100 steps)
265
  print("Starting Level 2 training...")
266
- wandb.init(project=WANDB_PROJECT, name="full-level2")
267
  trainer_l2 = GRPOTrainer(
268
  model=model,
269
  processing_class=tokenizer,
270
  reward_funcs=[reward_fn_l2],
271
  args=GRPOConfig(
272
  output_dir="./deceit-full-l2",
273
- max_steps=100,
274
  per_device_train_batch_size=2,
275
  num_generations=4,
276
  learning_rate=2e-6,
 
39
  wandb.login(key=os.environ["WANDB_API_KEY"])
40
 
41
  # Config
42
+ MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct"
43
+ HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
44
  WANDB_PROJECT = "deceit-full"
45
  LORA_RANK = 16
46
 
 
168
  # Train — Level 1 (200 steps)
169
  print("Starting Level 1 training...")
170
  FastLanguageModel.for_training(model)
171
+ wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
172
 
173
  trainer = GRPOTrainer(
174
  model=model,
 
176
  reward_funcs=[reward_fn],
177
  args=GRPOConfig(
178
  output_dir="./deceit-full",
179
+ max_steps=150,
180
  per_device_train_batch_size=2,
181
  num_generations=4,
182
  learning_rate=5e-6,
 
263
 
264
  # Train Level 2 (100 steps)
265
  print("Starting Level 2 training...")
266
+ wandb.init(project=WANDB_PROJECT, name="1.5b-level2")
267
  trainer_l2 = GRPOTrainer(
268
  model=model,
269
  processing_class=tokenizer,
270
  reward_funcs=[reward_fn_l2],
271
  args=GRPOConfig(
272
  output_dir="./deceit-full-l2",
273
+ max_steps=80,
274
  per_device_train_batch_size=2,
275
  num_generations=4,
276
  learning_rate=2e-6,