Jayant-Kernel committed on
upgrade: Qwen 1.5B model, 150 L1 + 80 L2 steps
Browse files
train.py
CHANGED
|
@@ -39,8 +39,8 @@ login(token=os.environ["HF_TOKEN"])
|
|
| 39 |
wandb.login(key=os.environ["WANDB_API_KEY"])
|
| 40 |
|
| 41 |
# Config
|
| 42 |
-
MODEL_NAME = "unsloth/Qwen2.5-
|
| 43 |
-
HF_REPO_ID = "Ajsaxena/deceit-qwen-
|
| 44 |
WANDB_PROJECT = "deceit-full"
|
| 45 |
LORA_RANK = 16
|
| 46 |
|
|
@@ -168,7 +168,7 @@ train_dataset = Dataset.from_list([
|
|
| 168 |
# Train — Level 1 (200 steps)
|
| 169 |
print("Starting Level 1 training...")
|
| 170 |
FastLanguageModel.for_training(model)
|
| 171 |
-
wandb.init(project=WANDB_PROJECT, name="
|
| 172 |
|
| 173 |
trainer = GRPOTrainer(
|
| 174 |
model=model,
|
|
@@ -176,7 +176,7 @@ trainer = GRPOTrainer(
|
|
| 176 |
reward_funcs=[reward_fn],
|
| 177 |
args=GRPOConfig(
|
| 178 |
output_dir="./deceit-full",
|
| 179 |
-
max_steps=
|
| 180 |
per_device_train_batch_size=2,
|
| 181 |
num_generations=4,
|
| 182 |
learning_rate=5e-6,
|
|
@@ -263,14 +263,14 @@ def reward_fn_l2(completions, prompts=None, **kwargs):
|
|
| 263 |
|
| 264 |
# Train Level 2 (100 steps)
|
| 265 |
print("Starting Level 2 training...")
|
| 266 |
-
wandb.init(project=WANDB_PROJECT, name="
|
| 267 |
trainer_l2 = GRPOTrainer(
|
| 268 |
model=model,
|
| 269 |
processing_class=tokenizer,
|
| 270 |
reward_funcs=[reward_fn_l2],
|
| 271 |
args=GRPOConfig(
|
| 272 |
output_dir="./deceit-full-l2",
|
| 273 |
-
max_steps=
|
| 274 |
per_device_train_batch_size=2,
|
| 275 |
num_generations=4,
|
| 276 |
learning_rate=2e-6,
|
|
|
|
| 39 |
wandb.login(key=os.environ["WANDB_API_KEY"])
|
| 40 |
|
| 41 |
# Config
|
| 42 |
+
MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct"
|
| 43 |
+
HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
|
| 44 |
WANDB_PROJECT = "deceit-full"
|
| 45 |
LORA_RANK = 16
|
| 46 |
|
|
|
|
| 168 |
# Train — Level 1 (150 steps)
|
| 169 |
print("Starting Level 1 training...")
|
| 170 |
FastLanguageModel.for_training(model)
|
| 171 |
+
wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
|
| 172 |
|
| 173 |
trainer = GRPOTrainer(
|
| 174 |
model=model,
|
|
|
|
| 176 |
reward_funcs=[reward_fn],
|
| 177 |
args=GRPOConfig(
|
| 178 |
output_dir="./deceit-full",
|
| 179 |
+
max_steps=150,
|
| 180 |
per_device_train_batch_size=2,
|
| 181 |
num_generations=4,
|
| 182 |
learning_rate=5e-6,
|
|
|
|
| 263 |
|
| 264 |
# Train Level 2 (80 steps)
|
| 265 |
print("Starting Level 2 training...")
|
| 266 |
+
wandb.init(project=WANDB_PROJECT, name="1.5b-level2")
|
| 267 |
trainer_l2 = GRPOTrainer(
|
| 268 |
model=model,
|
| 269 |
processing_class=tokenizer,
|
| 270 |
reward_funcs=[reward_fn_l2],
|
| 271 |
args=GRPOConfig(
|
| 272 |
output_dir="./deceit-full-l2",
|
| 273 |
+
max_steps=80,
|
| 274 |
per_device_train_batch_size=2,
|
| 275 |
num_generations=4,
|
| 276 |
learning_rate=2e-6,
|