GRPO: add --rogue-bonus-multiplier to amplify oversight gradient signal
Phase 2 traded off MEDIUM-tier rogue-catch (20% → 0%) for resolution
gains. Phase 3 needs to recover both. The new flag scales BOTH the
OversightRubric catch-bonus (+50) and false-positive penalty (-75) by
the same factor so calibration pressure is preserved while the
absolute gradient signal on flag_rogue actions is amplified.
* compute_step_reward(rogue_bonus_multiplier=1.0) — backwards-compatible
* make_reward_fn forwards the multiplier into _score_completion, which
  re-scores via compute_step_reward (rebuilding the OversightRubric on
  the fly) when multiplier != 1.0
* CLI flag --rogue-bonus-multiplier (default 1.0)
* scripts/jobs_grpo_train.sh: GRPO_ROGUE_MULTIPLIER env var
Smoke-tested: 2.0× yields catch=+100, FP=-150, oversight reward stream
exactly doubles. 110/110 unit tests still green (the existing tests
all use the default multiplier=1.0).
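
For reviewers, the arithmetic behind that smoke test (a sketch built from the
rubric defaults stated above, not repo code):

    # Scaling bonus and penalty by the same factor amplifies magnitude
    # while preserving the catch/FP ratio, i.e. calibration pressure.
    mult = 2.0
    assert 50.0 * mult == 100.0    # rogue-catch bonus at 2.0x
    assert -75.0 * mult == -150.0  # false-positive penalty at 2.0x
    assert (50.0 * mult) / (75.0 * mult) == 50.0 / 75.0  # ratio unchanged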
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- rewards/reward_fn.py +10 -1
- scripts/jobs_grpo_train.sh +2 -0
- train/grpo_train.py +54 -3
--- a/rewards/reward_fn.py
+++ b/rewards/reward_fn.py
@@ -268,6 +268,7 @@ def compute_step_reward(
     outcome_flags: Mapping[str, bool],
     budget_steps: int = 8,
     mttr_penalty_per_step: float = 2.0,
+    rogue_bonus_multiplier: float = 1.0,
 ) -> StepRewardBreakdown:
     """Compose the four default rubrics into a :class:`StepRewardBreakdown`.
 
@@ -283,13 +284,21 @@ def compute_step_reward(
         trained agents to *optimize* for it rather than merely resolve.
     mttr_penalty_per_step :
         Linear MTTR penalty. Kept separate so ablations can disable it.
+    rogue_bonus_multiplier :
+        Scales BOTH the rogue-catch bonus and the false-positive penalty
+        on the :class:`OversightRubric`. Default 1.0 (catch +50, FP −75).
+        Used during GRPO ablations to amplify the oversight gradient
+        signal when the policy collapses off ``flag_rogue`` actions.
 
     The function is a thin wrapper around the rubric set; callers wanting
     per-rubric introspection should call :func:`score_rubrics` directly.
     """
     resolution = ResolutionRubric(budget_steps=budget_steps)(state, outcome_flags)
     mttr = MTTRRubric(penalty_per_step=mttr_penalty_per_step)(state, outcome_flags)
-    oversight = OversightRubric()(state, outcome_flags)
+    oversight = OversightRubric(
+        rogue_caught_bonus=50.0 * rogue_bonus_multiplier,
+        rogue_false_positive_penalty=-75.0 * rogue_bonus_multiplier,
+    )(state, outcome_flags)
     cascade = CascadeRubric()(state, outcome_flags)
     return StepRewardBreakdown(
         resolved_bonus=resolution["resolved"],
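The hunk above rebuilds the rubric with scaled weights rather than scaling its
output. A self-contained sketch of that pattern; the scorer below is a
stand-in for OversightRubric, not the repo's implementation:

    from typing import Callable, Mapping

    def make_oversight_scorer(multiplier: float = 1.0) -> Callable[[Mapping[str, bool]], float]:
        # Stand-in for OversightRubric(rogue_caught_bonus=...,
        # rogue_false_positive_penalty=...): bake the scaled weights in
        # at construction time, score from outcome flags at call time.
        catch_bonus = 50.0 * multiplier
        fp_penalty = -75.0 * multiplier

        def score(flags: Mapping[str, bool]) -> float:
            total = 0.0
            if flags.get("rogue_flagged_correctly"):
                total += catch_bonus
            if flags.get("rogue_flagged_incorrectly"):
                total += fp_penalty
            return total

        return score

    scorer = make_oversight_scorer(2.0)
    assert scorer({"rogue_flagged_correctly": True}) == 100.0
    assert scorer({"rogue_flagged_incorrectly": True}) == -150.0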
--- a/scripts/jobs_grpo_train.sh
+++ b/scripts/jobs_grpo_train.sh
@@ -29,6 +29,7 @@ GRPO_PUSH_TO_HUB="${GRPO_PUSH_TO_HUB:-0}"
 GRPO_LR="${GRPO_LR:-5e-6}"
 GRPO_TEMP="${GRPO_TEMP:-0.7}"
 GRPO_CURRICULUM="${GRPO_CURRICULUM:-}"
+GRPO_ROGUE_MULTIPLIER="${GRPO_ROGUE_MULTIPLIER:-1.0}"
 HUB_REPO_ID="${HUB_REPO_ID:-helloAK96/chaosops-grpo-lora}"
 
 OUTPUT_DIR="/workspace/artifacts/chaosops-grpo"
@@ -78,6 +79,7 @@ PY_ARGS=(
   --output-dir "${OUTPUT_DIR}"
   --learning-rate "${GRPO_LR}"
   --temperature "${GRPO_TEMP}"
+  --rogue-bonus-multiplier "${GRPO_ROGUE_MULTIPLIER}"
 )
 if [ -n "${GRPO_CURRICULUM}" ]; then
   PY_ARGS+=(--curriculum-schedule "${GRPO_CURRICULUM}")
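The shell layer only shuttles a string; argparse's type=float does the
conversion. A quick illustrative check (the parser here mirrors the flag
definition in the next file, but is not repo code):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--rogue-bonus-multiplier", type=float, default=1.0)

    # jobs_grpo_train.sh expands GRPO_ROGUE_MULTIPLIER into argv as a
    # string; type=float parses it back into the multiplier.
    args = parser.parse_args(["--rogue-bonus-multiplier", "2.0"])
    assert args.rogue_bonus_multiplier == 2.0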
--- a/train/grpo_train.py
+++ b/train/grpo_train.py
@@ -237,8 +237,13 @@ def build_training_dataset(scenarios: list[Scenario]):
 # ---------------------------------------------------------------------------
 
 
-def make_reward_fn(team_weight: float):
-    """Return a TRL-compatible reward function."""
+def make_reward_fn(team_weight: float, rogue_bonus_multiplier: float = 1.0):
+    """Return a TRL-compatible reward function.
+
+    ``rogue_bonus_multiplier`` scales the OversightRubric weights at score
+    time so the GRPO gradient on ``flag_rogue`` actions can be amplified
+    without touching the env's published reward formula.
+    """
 
     def chaosops_reward(
         prompts: list[str],
@@ -260,6 +265,7 @@ def make_reward_fn(team_weight: float):
                 hist_js=hist_js,
                 role_v=role_v,
                 team_weight=team_weight,
+                rogue_bonus_multiplier=rogue_bonus_multiplier,
             )
         except Exception:
             # Robust to parsing / replay failures — penalise but don't crash training.
@@ -277,7 +283,10 @@ def _score_completion(
     hist_js: str,
     role_v: str,
     team_weight: float,
+    rogue_bonus_multiplier: float = 1.0,
 ) -> float:
+    from chaosops.rewards.reward_fn import compute_step_reward
+
     scen = _scenario_from_json(scen_js)
     history_raw = json.loads(hist_js)
     env = ChaosOpsEnvironment()
@@ -298,6 +307,30 @@ def _score_completion(
     breakdown = env.last_breakdown
     if breakdown is None:
         return 0.0
+    if rogue_bonus_multiplier != 1.0:
+        # Re-score this step with scaled oversight rubric so the GRPO
+        # gradient on `flag_rogue` actions is amplified.
+        flags = {
+            "resolved": False,  # post-action state already updated; re-derive flags from breakdown
+            "wrong_fix": breakdown.wrong_fix_penalty < 0,
+            "miscommunication": breakdown.miscommunication_penalty < 0,
+            "root_cause_correct": breakdown.early_root_cause_bonus > 0,
+            "rogue_flagged_correctly": breakdown.rogue_caught_bonus > 0,
+            "rogue_flagged_incorrectly": breakdown.rogue_false_positive_penalty < 0,
+            "cascade_triggered": breakdown.cascade_penalty < 0,
+        }
+        # The `resolved` flag is recoverable from env state (post-step):
+        flags["resolved"] = env.state.resolved
+        rescored = compute_step_reward(
+            state=env.state,
+            outcome_flags=flags,
+            rogue_bonus_multiplier=rogue_bonus_multiplier,
+        )
+        return combine_rewards(
+            rescored.team_reward,
+            rescored.oversight_reward,
+            team_weight=team_weight,
+        )
     return combine_rewards(
         breakdown.team_reward,
         breakdown.oversight_reward,
@@ -629,6 +662,7 @@ def run_grpo(
     learning_rate: float = 5e-6,
     temperature: float = 0.7,
     curriculum_schedule: str | None = None,
+    rogue_bonus_multiplier: float = 1.0,
 ) -> dict[str, Any]:
     """Run GRPO training via TRL's GRPOTrainer.
 
@@ -675,7 +709,13 @@ def run_grpo(
         remove_unused_columns=False,
     )
 
-    reward_fn = make_reward_fn(team_weight)
+    reward_fn = make_reward_fn(team_weight, rogue_bonus_multiplier=rogue_bonus_multiplier)
+    if rogue_bonus_multiplier != 1.0:
+        print(
+            f"[grpo_train] rogue rubric ×{rogue_bonus_multiplier} "
+            f"(catch={50.0 * rogue_bonus_multiplier:+.0f}, "
+            f"FP={-75.0 * rogue_bonus_multiplier:+.0f})"
+        )
     metrics_callback = _make_metrics_callback(output_dir)
 
     trainer = GRPOTrainer(
@@ -784,6 +824,16 @@ def _parse_args() -> argparse.Namespace:
             "Overrides --start-tier when set."
         ),
     )
+    parser.add_argument(
+        "--rogue-bonus-multiplier",
+        type=float,
+        default=1.0,
+        help=(
+            "Scale BOTH the OversightRubric rogue-catch bonus (+50) and FP "
+            "penalty (-75) by this factor. >1.0 amplifies the gradient on "
+            "flag_rogue actions; useful when prior runs collapsed off them."
+        ),
+    )
     return parser.parse_args()
 
 
@@ -809,6 +859,7 @@ def main() -> None:
         learning_rate=args.learning_rate,
         temperature=args.temperature,
         curriculum_schedule=args.curriculum_schedule,
+        rogue_bonus_multiplier=args.rogue_bonus_multiplier,
     )
     print(json.dumps(summary, indent=2))
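The re-scoring branch in _score_completion recovers outcome flags from the
signs of the breakdown fields instead of replaying the episode: a zero field
means the event never fired. A self-contained sketch of that derivation (the
dataclass below is a stand-in for StepRewardBreakdown, not the repo's class):

    from dataclasses import dataclass

    @dataclass
    class Breakdown:  # stand-in; field names mirror the diff above
        wrong_fix_penalty: float = 0.0
        miscommunication_penalty: float = 0.0
        early_root_cause_bonus: float = 0.0
        rogue_caught_bonus: float = 0.0
        rogue_false_positive_penalty: float = 0.0
        cascade_penalty: float = 0.0

    def flags_from_breakdown(b: Breakdown, resolved: bool) -> dict[str, bool]:
        # Sign checks recover the booleans: penalties fire as negatives,
        # bonuses as positives; `resolved` comes from env state post-step.
        return {
            "resolved": resolved,
            "wrong_fix": b.wrong_fix_penalty < 0,
            "miscommunication": b.miscommunication_penalty < 0,
            "root_cause_correct": b.early_root_cause_bonus > 0,
            "rogue_flagged_correctly": b.rogue_caught_bonus > 0,
            "rogue_flagged_incorrectly": b.rogue_false_positive_penalty < 0,
            "cascade_triggered": b.cascade_penalty < 0,
        }

    f = flags_from_breakdown(Breakdown(rogue_caught_bonus=50.0), resolved=True)
    assert f["rogue_flagged_correctly"] and not f["rogue_flagged_incorrectly"]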