Spaces:
Sleeping
Sleeping
make GRPOConfig kwargs version-tolerant
Browse files- training/train_grpo_hf_job.py +42 -24
training/train_grpo_hf_job.py
CHANGED
|
@@ -288,30 +288,48 @@ assert sane_r > 0, f"reward fn broken (expected >0 on case 0, got {sane_r})"
|
|
| 288 |
# 7. GRPO training
|
| 289 |
# ---------------------------------------------------------------------------
|
| 290 |
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
trainer = GRPOTrainer(
|
| 317 |
model=model,
|
|
|
|
# 7. GRPO training
# ---------------------------------------------------------------------------

# Build kwargs incrementally and only pass args the installed TRL accepts -
# the GRPOConfig surface has shifted across releases (max_prompt_length
# disappeared in some, top_p / epsilon were renamed, etc.).
import inspect

_grpo_sig = inspect.signature(GRPOConfig.__init__).parameters

# FIX: some TRL releases declare GRPOConfig.__init__ with a trailing
# **kwargs (forwarded to the TrainingArguments base).  On those versions a
# plain `name in signature` test would silently drop EVERY optional arg
# below even though the config accepts them — detect VAR_KEYWORD and, when
# present, pass the optional args through unconditionally.
_grpo_accepts_var_kw = any(
    p.kind is inspect.Parameter.VAR_KEYWORD for p in _grpo_sig.values()
)

# Baseline arguments assumed stable across every supported TRL release;
# these are always passed.
_grpo_kwargs: dict = {
    "output_dir": str(OUT_DIR),
    "learning_rate": LEARNING_RATE,
    "weight_decay": 0.1,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",
    "optim": "adamw_torch",
    "logging_steps": 1,
    "per_device_train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACCUM,
    "num_generations": NUM_GENERATIONS,
    "max_steps": NUM_GRPO_STEPS,
    "save_steps": 999_999,  # effectively disables intermediate checkpoints
    "report_to": "none",
    "bf16": True,
}

# Arguments that have been renamed or removed in some TRL releases; each is
# forwarded only when the installed GRPOConfig actually accepts it.
_optional_kwargs: dict = {
    "adam_beta1": 0.9,
    "adam_beta2": 0.99,
    "max_prompt_length": MAX_PROMPT_LEN,
    "max_completion_length": MAX_COMPLETION_LEN,
    "temperature": 0.9,
    "top_p": 0.95,
    "epsilon": 0.2,  # PPO-style clip range (renamed in some releases)
    "beta": 0.04,    # KL penalty coefficient
}
for k, v in _optional_kwargs.items():
    if k in _grpo_sig or _grpo_accepts_var_kw:
        _grpo_kwargs[k] = v
    else:
        print(f"[config] skipping unknown GRPOConfig arg: {k}")

print("[config] GRPOConfig kwargs:", sorted(_grpo_kwargs.keys()))
training_args = GRPOConfig(**_grpo_kwargs)
|
| 333 |
|
| 334 |
trainer = GRPOTrainer(
|
| 335 |
model=model,
|