GRPO: expose --learning-rate, --temperature, --curriculum-schedule
Phase-1 submission upgrade for the hackathon: previous run used the
TRL default LR (5e-6) with EASY-only data and saw a flat reward curve
+ KL=0.14. Three new knobs let us re-target without code edits:
* --learning-rate (default 5e-6, set 2e-5 to break the flat-reward
plateau without touching anything else)
* --temperature (default 0.7, set 0.8 for more exploration in Phase 2)
* --curriculum-schedule "easy:200,medium:200,hard:200" — pre-rolls a
step-budget tier sequence so GRPO sees increasing difficulty over
training instead of EASY for all 600 steps. Falls back to the old
--start-tier behavior when the flag isn't passed.
scripts/jobs_grpo_train.sh forwards GRPO_LR / GRPO_TEMP /
GRPO_CURRICULUM env vars; everything else is unchanged. 110/110 unit
tests pass (helper is a no-op when the schedule env var isn't set).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- scripts/jobs_grpo_train.sh +19 -9
- train/grpo_train.py +79 -2
|
@@ -26,6 +26,9 @@ GRPO_LORA_RANK="${GRPO_LORA_RANK:-16}"
|
|
| 26 |
GRPO_LOG_EVERY="${GRPO_LOG_EVERY:-1}"
|
| 27 |
GRPO_MAX_SEQ_LENGTH="${GRPO_MAX_SEQ_LENGTH:-1024}"
|
| 28 |
GRPO_PUSH_TO_HUB="${GRPO_PUSH_TO_HUB:-0}"
|
|
|
|
|
|
|
|
|
|
| 29 |
HUB_REPO_ID="${HUB_REPO_ID:-helloAK96/chaosops-grpo-lora}"
|
| 30 |
|
| 31 |
OUTPUT_DIR="/workspace/artifacts/chaosops-grpo"
|
|
@@ -63,16 +66,23 @@ mkdir -p "${OUTPUT_DIR}"
|
|
| 63 |
|
| 64 |
GRPO_BACKEND="${GRPO_BACKEND:-transformers}"
|
| 65 |
|
| 66 |
-
echo "==[chaosops]== launching GRPO (backend=$GRPO_BACKEND, $GRPO_EPISODES episodes, group=$GRPO_GROUP_SIZE, lora_rank=$GRPO_LORA_RANK)"
|
| 67 |
-
|
| 68 |
-
--model-name "${GRPO_MODEL}"
|
| 69 |
-
--backend "${GRPO_BACKEND}"
|
| 70 |
-
--total-episodes "${GRPO_EPISODES}"
|
| 71 |
-
--group-size "${GRPO_GROUP_SIZE}"
|
| 72 |
-
--log-every "${GRPO_LOG_EVERY}"
|
| 73 |
-
--max-seq-length "${GRPO_MAX_SEQ_LENGTH}"
|
| 74 |
-
--lora-rank "${GRPO_LORA_RANK}"
|
| 75 |
--output-dir "${OUTPUT_DIR}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
echo "==[chaosops]== training metrics:"
|
| 78 |
cat "${OUTPUT_DIR}/training_metrics.json" || echo "(no metrics file)"
|
|
|
|
GRPO_LOG_EVERY="${GRPO_LOG_EVERY:-1}"
GRPO_MAX_SEQ_LENGTH="${GRPO_MAX_SEQ_LENGTH:-1024}"
GRPO_PUSH_TO_HUB="${GRPO_PUSH_TO_HUB:-0}"
# Phase-2 knobs — forwarded to chaosops.train.grpo_train below.
GRPO_LR="${GRPO_LR:-5e-6}"              # learning rate; 2e-5 to break a flat-reward plateau
GRPO_TEMP="${GRPO_TEMP:-0.7}"           # rollout sampling temperature
GRPO_CURRICULUM="${GRPO_CURRICULUM:-}"  # e.g. "easy:200,medium:200,hard:200"; empty keeps legacy --start-tier path
HUB_REPO_ID="${HUB_REPO_ID:-helloAK96/chaosops-grpo-lora}"

OUTPUT_DIR="/workspace/artifacts/chaosops-grpo"

GRPO_BACKEND="${GRPO_BACKEND:-transformers}"

echo "==[chaosops]== launching GRPO (backend=$GRPO_BACKEND, $GRPO_EPISODES episodes, group=$GRPO_GROUP_SIZE, lora_rank=$GRPO_LORA_RANK, lr=$GRPO_LR, temp=$GRPO_TEMP, curriculum=${GRPO_CURRICULUM:-(none)})"
# Build the trainer CLI as an array so quoting survives word-splitting.
PY_ARGS=(
  --model-name "${GRPO_MODEL}"
  --backend "${GRPO_BACKEND}"
  --total-episodes "${GRPO_EPISODES}"
  --group-size "${GRPO_GROUP_SIZE}"
  --log-every "${GRPO_LOG_EVERY}"
  --max-seq-length "${GRPO_MAX_SEQ_LENGTH}"
  --lora-rank "${GRPO_LORA_RANK}"
  --output-dir "${OUTPUT_DIR}"
  --learning-rate "${GRPO_LR}"
  --temperature "${GRPO_TEMP}"
)
# Pass the curriculum flag only when set, so the trainer falls back to its
# old --start-tier behavior otherwise.
if [ -n "${GRPO_CURRICULUM}" ]; then
  PY_ARGS+=(--curriculum-schedule "${GRPO_CURRICULUM}")
fi
python -m chaosops.train.grpo_train "${PY_ARGS[@]}"

echo "==[chaosops]== training metrics:"
cat "${OUTPUT_DIR}/training_metrics.json" || echo "(no metrics file)"
|
|
@@ -567,6 +567,48 @@ def _collect_scenarios(curriculum: Curriculum, *, total: int) -> list[Scenario]:
|
|
| 567 |
return scenarios[:total]
|
| 568 |
|
| 569 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
# ---------------------------------------------------------------------------
|
| 571 |
# Training loop — modern TRL GRPO API
|
| 572 |
# ---------------------------------------------------------------------------
|
|
@@ -585,6 +627,8 @@ def run_grpo(
|
|
| 585 |
max_seq_length: int = 1024,
|
| 586 |
max_completion_length: int = 96,
|
| 587 |
learning_rate: float = 5e-6,
|
|
|
|
|
|
|
| 588 |
) -> dict[str, Any]:
|
| 589 |
"""Run GRPO training via TRL's GRPOTrainer.
|
| 590 |
|
|
@@ -597,7 +641,16 @@ def run_grpo(
|
|
| 597 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 598 |
|
| 599 |
scenario_count = max(total_episodes, 8)
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
dataset = build_training_dataset(scenarios)
|
| 602 |
|
| 603 |
# Every optim step: 1 unique prompt × group_size completions.
|
|
@@ -608,7 +661,7 @@ def run_grpo(
|
|
| 608 |
per_device_train_batch_size=per_device_train_batch_size,
|
| 609 |
gradient_accumulation_steps=1,
|
| 610 |
num_generations=group_size,
|
| 611 |
-
temperature=
|
| 612 |
max_prompt_length=max_seq_length,
|
| 613 |
max_completion_length=max_completion_length,
|
| 614 |
learning_rate=learning_rate,
|
|
@@ -710,6 +763,27 @@ def _parse_args() -> argparse.Namespace:
|
|
| 710 |
choices=["auto", "unsloth", "transformers"],
|
| 711 |
help="Model loader. 'auto' tries Unsloth, falls back to transformers.",
|
| 712 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
return parser.parse_args()
|
| 714 |
|
| 715 |
|
|
@@ -732,6 +806,9 @@ def main() -> None:
|
|
| 732 |
log_every=args.log_every,
|
| 733 |
output_dir=args.output_dir,
|
| 734 |
max_seq_length=args.max_seq_length,
|
|
|
|
|
|
|
|
|
|
| 735 |
)
|
| 736 |
print(json.dumps(summary, indent=2))
|
| 737 |
|
|
|
|
| 567 |
return scenarios[:total]
|
| 568 |
|
| 569 |
|
def _scenarios_from_schedule(schedule: str, *, total: int) -> list[Scenario]:
    """Build a curriculum dataset from a step-budget schedule.

    Format: ``"easy:200,medium:200,hard:200"`` — generates 200 EASY then 200
    MEDIUM then 200 HARD scenarios so TRL's GRPOTrainer (which iterates the
    dataset in order under ``shuffle=False`` semantics for max_steps) sees
    increasing difficulty over training.

    If the schedule's total < ``total``, the last tier is padded by cycling
    its failure types until ``total`` is reached.

    Raises:
        ValueError: if a chunk is malformed (missing ``tier:count``), names
            an unknown tier, or has a non-integer count.
    """
    parsed: list[tuple[DifficultyTier, int]] = []
    for chunk in schedule.split(","):
        chunk = chunk.strip()
        if not chunk:
            # Tolerate trailing / doubled commas, e.g. "easy:200,".
            continue
        tier_name, sep, count = chunk.partition(":")
        if not sep or not count.strip():
            # Fail with a clear message instead of int("") -> ValueError.
            raise ValueError(
                f"malformed curriculum chunk {chunk!r}; expected 'tier:count'"
            )
        tier = DifficultyTier(tier_name.strip().lower())
        parsed.append((tier, int(count.strip())))

    scenarios: list[Scenario] = []
    for tier, count in parsed:
        cycle_seed = 0
        tier_scenarios: list[Scenario] = []
        while len(tier_scenarios) < count:
            batch = scenarios_for_tier(
                tier, seed_offset=cycle_seed, episodes_per_type=1
            )
            if not batch:
                # Guard: an empty batch would otherwise loop forever.
                break
            tier_scenarios.extend(batch)
            cycle_seed += 97
        scenarios.extend(tier_scenarios[:count])

    # Pad with the last tier if the schedule under-shoots ``total``.
    if scenarios and len(scenarios) < total:
        last_tier = parsed[-1][0]
        cycle_seed = 9000  # offset past the schedule's seeds
        while len(scenarios) < total:
            batch = scenarios_for_tier(
                last_tier, seed_offset=cycle_seed, episodes_per_type=1
            )
            if not batch:
                break
            scenarios.extend(batch)
            cycle_seed += 97
    return scenarios[:total]
|
| 610 |
+
|
| 611 |
+
|
| 612 |
# ---------------------------------------------------------------------------
|
| 613 |
# Training loop — modern TRL GRPO API
|
| 614 |
# ---------------------------------------------------------------------------
|
|
|
|
| 627 |
max_seq_length: int = 1024,
|
| 628 |
max_completion_length: int = 96,
|
| 629 |
learning_rate: float = 5e-6,
|
| 630 |
+
temperature: float = 0.7,
|
| 631 |
+
curriculum_schedule: str | None = None,
|
| 632 |
) -> dict[str, Any]:
|
| 633 |
"""Run GRPO training via TRL's GRPOTrainer.
|
| 634 |
|
|
|
|
| 641 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 642 |
|
| 643 |
scenario_count = max(total_episodes, 8)
|
| 644 |
+
if curriculum_schedule:
|
| 645 |
+
scenarios = _scenarios_from_schedule(
|
| 646 |
+
curriculum_schedule, total=scenario_count
|
| 647 |
+
)
|
| 648 |
+
print(
|
| 649 |
+
f"[grpo_train] curriculum schedule active: {curriculum_schedule} "
|
| 650 |
+
f"({len(scenarios)} scenarios across tiers)"
|
| 651 |
+
)
|
| 652 |
+
else:
|
| 653 |
+
scenarios = _collect_scenarios(curriculum, total=scenario_count)
|
| 654 |
dataset = build_training_dataset(scenarios)
|
| 655 |
|
| 656 |
# Every optim step: 1 unique prompt × group_size completions.
|
|
|
|
| 661 |
per_device_train_batch_size=per_device_train_batch_size,
|
| 662 |
gradient_accumulation_steps=1,
|
| 663 |
num_generations=group_size,
|
| 664 |
+
temperature=temperature,
|
| 665 |
max_prompt_length=max_seq_length,
|
| 666 |
max_completion_length=max_completion_length,
|
| 667 |
learning_rate=learning_rate,
|
|
|
|
| 763 |
choices=["auto", "unsloth", "transformers"],
|
| 764 |
help="Model loader. 'auto' tries Unsloth, falls back to transformers.",
|
| 765 |
)
|
| 766 |
+
parser.add_argument(
|
| 767 |
+
"--learning-rate",
|
| 768 |
+
type=float,
|
| 769 |
+
default=5e-6,
|
| 770 |
+
help="GRPO learning rate. Default 5e-6; 2e-5 if reward stays flat.",
|
| 771 |
+
)
|
| 772 |
+
parser.add_argument(
|
| 773 |
+
"--temperature",
|
| 774 |
+
type=float,
|
| 775 |
+
default=0.7,
|
| 776 |
+
help="Sampling temperature for completions during GRPO rollout.",
|
| 777 |
+
)
|
| 778 |
+
parser.add_argument(
|
| 779 |
+
"--curriculum-schedule",
|
| 780 |
+
type=str,
|
| 781 |
+
default=None,
|
| 782 |
+
help=(
|
| 783 |
+
"Step-budget tier schedule, e.g. 'easy:200,medium:200,hard:200'. "
|
| 784 |
+
"Overrides --start-tier when set."
|
| 785 |
+
),
|
| 786 |
+
)
|
| 787 |
return parser.parse_args()
|
| 788 |
|
| 789 |
|
|
|
|
| 806 |
log_every=args.log_every,
|
| 807 |
output_dir=args.output_dir,
|
| 808 |
max_seq_length=args.max_seq_length,
|
| 809 |
+
learning_rate=args.learning_rate,
|
| 810 |
+
temperature=args.temperature,
|
| 811 |
+
curriculum_schedule=args.curriculum_schedule,
|
| 812 |
)
|
| 813 |
print(json.dumps(summary, indent=2))
|
| 814 |
|