# chaosops/train/grpo_train.py
"""GRPO training entry point for ChaosOps AI.
Runs on Colab T4 (0.5B model) or onsite HF-credit GPUs (7B model):
python -m chaosops.train.grpo_train \
--model-name Qwen/Qwen2.5-7B-Instruct \
--total-episodes 400 \
--group-size 4 \
--output-dir artifacts/chaosops-grpo
Design
------
* :func:`build_training_dataset` pre-rolls episodes with ``oracle_policy`` and
captures every agent turn as a dataset row. Each row is one
``(prompt, scenario, action_history)`` triple — sufficient to deterministically
reconstruct the env state for reward scoring.
* :func:`chaosops_reward` is the TRL-compatible reward function: it parses the
model's completion, replays scenario + history in a fresh env, applies the
action, and returns the per-step shaped reward (blend of team + oversight).
* GRPOTrainer samples ``group_size`` completions per prompt, computes
group-relative advantages from the rewards, and updates the LoRA adapter.
* :class:`ChaosOpsMetricsCallback` writes ``training_metrics.json`` in the
schema the Colab notebook's plot cell expects.
``rollout_episode`` / ``sample_group`` are retained for use by the dashboard
and evaluation scripts.
"""
from __future__ import annotations
import argparse
import dataclasses
import json
import statistics
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Callable
from chaosops.agents.llm_adapter import (
build_prompt,
parse_action,
)
from chaosops.agents.policies import oracle_policy
from chaosops.agents.runner import EpisodeStep
from chaosops.curriculum.generator import Curriculum, scenarios_for_tier
from chaosops.env.environment import ChaosOpsEnvironment
from chaosops.env.models import (
AgentRole,
ChaosOpsAction,
DifficultyTier,
FailureType,
)
from chaosops.env.world_sim import Scenario
from chaosops.rewards.reward_fn import combine_rewards
# ---------------------------------------------------------------------------
# Trajectory generation (kept for dashboard / eval callers)
# ---------------------------------------------------------------------------
@dataclasses.dataclass
class TurnSample:
"""One (prompt, completion, reward) triple — the unit GRPO consumes."""
prompt: str
completion: str
role: AgentRole
team_reward: float
oversight_reward: float
combined_reward: float
step: int
done: bool
GenerateFn = Callable[[str, AgentRole], str]
"""Signature: ``(prompt, role) -> completion``."""
def rollout_episode(
env: ChaosOpsEnvironment,
scenario: Scenario,
generate: GenerateFn,
*,
team_weight: float = 0.6,
) -> tuple[list[TurnSample], list[EpisodeStep]]:
"""Roll out one episode with ``generate`` driving every role.
Returns both the TurnSample list and the EpisodeStep list (1:1).
"""
observation = env.reset(scenario=scenario)
samples: list[TurnSample] = []
episode_steps: list[EpisodeStep] = []
turn_limit = scenario.max_steps * len(env.turn_order)
for turn in range(turn_limit):
role = observation.turn_role
prompt = build_prompt(observation)
completion = generate(prompt, role)
action = parse_action(completion, role=role)
next_obs = env.step(action)
breakdown = env.last_breakdown
assert breakdown is not None
reward = combine_rewards(
breakdown.team_reward, breakdown.oversight_reward, team_weight=team_weight
)
samples.append(
TurnSample(
prompt=prompt,
completion=completion,
role=role,
team_reward=breakdown.team_reward,
oversight_reward=breakdown.oversight_reward,
combined_reward=reward,
step=env.state.step_count,
done=next_obs.done,
)
)
episode_steps.append(
EpisodeStep(
turn=turn,
role=role,
observation=observation,
action=action,
reward=next_obs.reward or 0.0,
breakdown=breakdown,
done=next_obs.done,
)
)
if next_obs.done:
break
observation = next_obs
return samples, episode_steps
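# Usage sketch (illustrative, not an eval script: assumes a GenerateFn such as the
# one returned by ``make_generate_fn`` below and a scenario from ``scenarios_for_tier``):
#
#   env = ChaosOpsEnvironment()
#   scen = scenarios_for_tier(DifficultyTier.EASY, seed_offset=0, episodes_per_type=1)[0]
#   samples, steps = rollout_episode(env, scen, generate, team_weight=0.6)
#   print(len(samples), trajectory_reward(samples))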
def sample_group(
env: ChaosOpsEnvironment,
scenario: Scenario,
generate: GenerateFn,
*,
group_size: int,
team_weight: float,
) -> list[list[TurnSample]]:
"""Roll out ``group_size`` trajectories on perturbed seeds of the same scenario."""
group: list[list[TurnSample]] = []
base_seed = scenario.seed
for k in range(group_size):
perturbed = dataclasses.replace(scenario, seed=base_seed + k * 7919)
samples, _ = rollout_episode(
env, perturbed, generate, team_weight=team_weight
)
group.append(samples)
return group
def trajectory_reward(samples: Iterable[TurnSample]) -> float:
return sum(s.combined_reward for s in samples)
# ---------------------------------------------------------------------------
# Scenario / action serialization for dataset rows
# ---------------------------------------------------------------------------
def _scenario_to_json(scen: Scenario) -> str:
return json.dumps(
{
"failure_type": scen.failure_type.value,
"difficulty": scen.difficulty.value,
"seed": scen.seed,
"max_steps": scen.max_steps,
"inject_misleading_logs": scen.inject_misleading_logs,
"rogue_fleet_agent": scen.rogue_fleet_agent,
}
)
def _scenario_from_json(payload: str) -> Scenario:
d = json.loads(payload)
return Scenario(
failure_type=FailureType(d["failure_type"]),
difficulty=DifficultyTier(d["difficulty"]),
seed=int(d["seed"]),
max_steps=int(d["max_steps"]),
inject_misleading_logs=bool(d["inject_misleading_logs"]),
rogue_fleet_agent=d["rogue_fleet_agent"],
)
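# Round trip: ``_scenario_from_json(_scenario_to_json(scen))`` reproduces exactly the
# six fields serialized above, which (together with the replayed action history) is
# what the reward function relies on to rebuild the env state deterministically.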
# ---------------------------------------------------------------------------
# Dataset construction — oracle-rollout prompts
# ---------------------------------------------------------------------------
def build_training_dataset(scenarios: list[Scenario]):
"""Pre-roll every ``scenario`` with ``oracle_policy`` and collect per-turn rows.
Each row: ``{prompt, scenario_json, action_history_json, role, turn_idx}``.
The reward function uses scenario + action_history to deterministically
reconstruct the env state before scoring the model's completion.
"""
from datasets import Dataset # type: ignore[import-not-found]
rows: list[dict[str, Any]] = []
for scen in scenarios:
env = ChaosOpsEnvironment()
observation = env.reset(scenario=scen)
policy = oracle_policy(scen.failure_type)
action_history: list[dict[str, Any]] = []
turn_limit = scen.max_steps * len(env.turn_order)
for turn in range(turn_limit):
prompt = build_prompt(observation)
rows.append(
{
"prompt": prompt,
"scenario_json": _scenario_to_json(scen),
"action_history_json": json.dumps(action_history),
"role": observation.turn_role.value,
"turn_idx": turn,
}
)
action = policy(observation, observation.turn_role)
action_history.append(action.model_dump(mode="json"))
observation = env.step(action)
if observation.done:
break
return Dataset.from_list(rows)
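# Illustrative row shape (values are placeholders; ``role`` is an ``AgentRole.value``
# string and ``scenario_json`` the output of ``_scenario_to_json``):
#
#   {
#       "prompt": "<rendered observation prompt>",
#       "scenario_json": "<scenario JSON>",
#       "action_history_json": "[]",
#       "role": "<agent role>",
#       "turn_idx": 0,
#   }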
# ---------------------------------------------------------------------------
# GRPO reward function (modern TRL signature)
# ---------------------------------------------------------------------------
def make_reward_fn(team_weight: float, rogue_bonus_multiplier: float = 1.0):
"""Return a TRL-compatible reward function.
``rogue_bonus_multiplier`` scales the OversightRubric weights at score
time so the GRPO gradient on ``flag_rogue`` actions can be amplified
without touching the env's published reward formula.
"""
def chaosops_reward(
prompts: list[str],
completions: list[str],
scenario_json: list[str],
action_history_json: list[str],
role: list[str],
turn_idx: list[int],
**_kwargs: Any,
) -> list[float]:
rewards: list[float] = []
for completion, scen_js, hist_js, role_v in zip(
completions, scenario_json, action_history_json, role, strict=False
):
try:
reward = _score_completion(
completion=completion,
scen_js=scen_js,
hist_js=hist_js,
role_v=role_v,
team_weight=team_weight,
rogue_bonus_multiplier=rogue_bonus_multiplier,
)
except Exception:
# Robust to parsing / replay failures — penalise but don't crash training.
reward = -5.0
rewards.append(reward)
return rewards
return chaosops_reward
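# Smoke-test sketch: a hand-rolled call in the same keyword-list style TRL uses, with
# ``row`` one entry from ``build_training_dataset`` and ``completion`` any raw model
# output string (parse/replay failures come back as -5.0 rather than raising):
#
#   reward_fn = make_reward_fn(team_weight=0.6)
#   scores = reward_fn(
#       prompts=[row["prompt"]],
#       completions=[completion],
#       scenario_json=[row["scenario_json"]],
#       action_history_json=[row["action_history_json"]],
#       role=[row["role"]],
#       turn_idx=[row["turn_idx"]],
#   )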
def _score_completion(
*,
completion: str,
scen_js: str,
hist_js: str,
role_v: str,
team_weight: float,
rogue_bonus_multiplier: float = 1.0,
) -> float:
from chaosops.rewards.reward_fn import compute_step_reward
scen = _scenario_from_json(scen_js)
history_raw = json.loads(hist_js)
env = ChaosOpsEnvironment()
observation = env.reset(scenario=scen)
for past in history_raw:
past_action = ChaosOpsAction.model_validate(past)
observation = env.step(past_action)
if observation.done:
return 0.0
role_enum = AgentRole(role_v)
if observation.turn_role != role_enum:
# Replayed state doesn't match the captured row — treat as neutral.
return 0.0
# Completion may include chat-template artefacts; parse_action handles JSON extraction.
text = completion if isinstance(completion, str) else str(completion)
action = parse_action(text, role=role_enum)
env.step(action)
breakdown = env.last_breakdown
if breakdown is None:
return 0.0
if rogue_bonus_multiplier != 1.0:
# Re-score this step with scaled oversight rubric so the GRPO
# gradient on `flag_rogue` actions is amplified.
flags = {
"resolved": False, # post-action state already updated; re-derive flags from breakdown
"wrong_fix": breakdown.wrong_fix_penalty < 0,
"miscommunication": breakdown.miscommunication_penalty < 0,
"root_cause_correct": breakdown.early_root_cause_bonus > 0,
"rogue_flagged_correctly": breakdown.rogue_caught_bonus > 0,
"rogue_flagged_incorrectly": breakdown.rogue_false_positive_penalty < 0,
"cascade_triggered": breakdown.cascade_penalty < 0,
}
# The `resolved` flag is recoverable from env state (post-step):
flags["resolved"] = env.state.resolved
rescored = compute_step_reward(
state=env.state,
outcome_flags=flags,
rogue_bonus_multiplier=rogue_bonus_multiplier,
)
return combine_rewards(
rescored.team_reward,
rescored.oversight_reward,
team_weight=team_weight,
)
return combine_rewards(
breakdown.team_reward,
breakdown.oversight_reward,
team_weight=team_weight,
)
# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
def load_unsloth_model(
model_name: str,
*,
max_seq_length: int = 2048,
load_in_4bit: bool = True,
lora_rank: int = 32,
):
"""Load a base LLM with Unsloth + LoRA. Returns ``(model, tokenizer)``.
Requires triton + a C compiler at runtime; if either is missing,
fall back to :func:`load_transformers_model`.
"""
from unsloth import FastLanguageModel # type: ignore[import-not-found]
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
load_in_4bit=load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model,
r=lora_rank,
lora_alpha=lora_rank,
lora_dropout=0.0,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
bias="none",
use_gradient_checkpointing="unsloth",
)
return model, tokenizer
def load_transformers_model(
model_name: str,
*,
max_seq_length: int = 2048,
load_in_4bit: bool = True,
lora_rank: int = 32,
):
"""Plain ``transformers + peft`` model loader — no Unsloth/triton dep.
Used when the runtime image doesn't ship triton/cc (most lightweight
CUDA images). Slightly slower per step than Unsloth but works on any
standard PyTorch image.
"""
import torch # type: ignore[import-not-found]
from peft import LoraConfig, get_peft_model # type: ignore[import-not-found]
from transformers import ( # type: ignore[import-not-found]
AutoModelForCausalLM,
AutoTokenizer,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
load_kwargs: dict[str, Any] = {}
if load_in_4bit:
try:
from transformers import BitsAndBytesConfig # type: ignore[import-not-found]
load_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.float16,
)
except Exception:
# bitsandbytes unavailable; fall back to unquantized fp16 LoRA.
load_kwargs["torch_dtype"] = torch.float16
else:
load_kwargs["torch_dtype"] = torch.float16
if torch.cuda.is_available():
load_kwargs["device_map"] = {"": 0}
base = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
lora_cfg = LoraConfig(
r=lora_rank,
lora_alpha=lora_rank,
lora_dropout=0.0,
bias="none",
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg)
return model, tokenizer
def load_model(
model_name: str,
*,
backend: str = "auto",
max_seq_length: int = 2048,
load_in_4bit: bool = True,
lora_rank: int = 32,
):
"""Dispatch to the requested loader, with auto-fallback.
``backend`` ∈ ``{"auto", "unsloth", "transformers"}``. ``auto`` tries
Unsloth first and falls back to transformers if the import fails or
the runtime can't satisfy triton's C-compiler dep.
"""
if backend == "transformers":
return load_transformers_model(
model_name,
max_seq_length=max_seq_length,
load_in_4bit=load_in_4bit,
lora_rank=lora_rank,
)
if backend == "unsloth":
return load_unsloth_model(
model_name,
max_seq_length=max_seq_length,
load_in_4bit=load_in_4bit,
lora_rank=lora_rank,
)
# auto
try:
return load_unsloth_model(
model_name,
max_seq_length=max_seq_length,
load_in_4bit=load_in_4bit,
lora_rank=lora_rank,
)
except Exception as exc:
print(f"[grpo_train] Unsloth path failed ({exc!r}); using transformers")
return load_transformers_model(
model_name,
max_seq_length=max_seq_length,
load_in_4bit=load_in_4bit,
lora_rank=lora_rank,
)
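# Usage sketch for the small Colab T4 path mentioned in the module docstring (repo id
# and hyperparameters are illustrative; ``backend="auto"`` drops to transformers when
# Unsloth or triton is unavailable):
#
#   model, tokenizer = load_model(
#       "Qwen/Qwen2.5-0.5B-Instruct",
#       backend="auto",
#       max_seq_length=1024,
#       load_in_4bit=True,
#       lora_rank=16,
#   )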
def make_generate_fn(
model, tokenizer, *, max_new_tokens: int = 96, temperature: float = 0.7
) -> GenerateFn:
"""Wrap an HF model in the ``GenerateFn`` signature used by dashboard rollouts."""
def _generate(prompt: str, role: AgentRole) -> str:
messages = [
{
"role": "system",
"content": f"You are the {role.value.upper()} agent in ChaosOps AI.",
},
{"role": "user", "content": prompt},
]
rendered = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(rendered, return_tensors="pt").to(model.device)
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
temperature=temperature,
do_sample=temperature > 0,
pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
)
text = tokenizer.decode(
outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
return text
return _generate
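# Greedy-decoding sketch for dashboard/eval rollouts (``temperature=0.0`` disables
# sampling via ``do_sample=temperature > 0`` above; settings are illustrative):
#
#   generate = make_generate_fn(model, tokenizer, temperature=0.0)
#   samples, steps = rollout_episode(ChaosOpsEnvironment(), scen, generate)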
# ---------------------------------------------------------------------------
# Metrics callback — writes training_metrics.json as the plot cell expects
# ---------------------------------------------------------------------------
def _make_metrics_callback(output_dir: Path):
from transformers import TrainerCallback # type: ignore[import-not-found]
class ChaosOpsMetricsCallback(TrainerCallback):
"""Capture TRL's per-log reward stats and persist them to JSON.
The Colab notebook's plot cell reads three fields: ``mean_team_reward``,
``mean_oversight_reward``, ``mean_combined_reward``. The reward function
emits a single ``combine_rewards(team, oversight)`` scalar, so all three
fields carry that same combined value; the signal is not split during
training, and the curve still shows the reward rising as training
progresses.
"""
def __init__(self) -> None:
self.log: list[dict[str, Any]] = []
self.output_dir = output_dir
self.metrics_path = output_dir / "training_metrics.json"
output_dir.mkdir(parents=True, exist_ok=True)
def on_log(self, args, state, control, logs=None, **kwargs): # noqa: ANN001 — HF signature
if not logs:
return
reward_key_candidates = [
"reward",
"rewards/chaosops_reward/mean",
"rewards/chaosops_reward",
]
reward: float | None = None
for key in reward_key_candidates:
if key in logs:
reward = float(logs[key])
break
if reward is None:
return
entry = {
"episode": int(state.global_step),
"mean_team_reward": reward,
"mean_oversight_reward": reward,
"mean_combined_reward": reward,
}
for extra in ("loss", "kl", "reward_std"):
if extra in logs:
entry[extra] = float(logs[extra])
self.log.append(entry)
self.metrics_path.write_text(json.dumps(self.log, indent=2))
return ChaosOpsMetricsCallback()
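# Resulting training_metrics.json shape (values illustrative; the plot cell reads the
# three ``mean_*`` fields, extras like ``loss`` are optional):
#
#   [
#     {"episode": 2, "mean_team_reward": 1.4, "mean_oversight_reward": 1.4,
#      "mean_combined_reward": 1.4, "loss": 0.031},
#     ...
#   ]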
# ---------------------------------------------------------------------------
# Scenario sourcing
# ---------------------------------------------------------------------------
def _collect_scenarios(curriculum: Curriculum, *, total: int) -> list[Scenario]:
"""Pull ``total`` scenarios from the current tier, cycling failure types."""
scenarios: list[Scenario] = []
cycle_seed = 0
while len(scenarios) < total:
batch = scenarios_for_tier(
curriculum.tier,
seed_offset=cycle_seed,
episodes_per_type=1,
)
scenarios.extend(batch)
cycle_seed += 97
return scenarios[:total]
def _scenarios_from_schedule(schedule: str, *, total: int) -> list[Scenario]:
"""Build a curriculum dataset from a step-budget schedule.
Format: ``"easy:200,medium:200,hard:200"`` — generates 200 EASY then 200
MEDIUM then 200 HARD scenarios so TRL's GRPOTrainer (which iterates the
dataset in order under ``shuffle=False`` semantics for max_steps) sees
increasing difficulty over training.
If the schedule's total < ``total``, the last tier is padded by cycling
its failure types until ``total`` is reached.
"""
parsed: list[tuple[DifficultyTier, int]] = []
for chunk in schedule.split(","):
tier_name, _, count = chunk.partition(":")
tier = DifficultyTier(tier_name.strip().lower())
parsed.append((tier, int(count.strip())))
scenarios: list[Scenario] = []
for tier, count in parsed:
cycle_seed = 0
tier_scenarios: list[Scenario] = []
while len(tier_scenarios) < count:
batch = scenarios_for_tier(
tier, seed_offset=cycle_seed, episodes_per_type=1
)
tier_scenarios.extend(batch)
cycle_seed += 97
scenarios.extend(tier_scenarios[:count])
# Pad with the last tier if the schedule under-shoots ``total``.
if scenarios and len(scenarios) < total:
last_tier = parsed[-1][0]
cycle_seed = 9000 # offset past the schedule's seeds
while len(scenarios) < total:
batch = scenarios_for_tier(
last_tier, seed_offset=cycle_seed, episodes_per_type=1
)
scenarios.extend(batch)
cycle_seed += 97
return scenarios[:total]
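# Worked example (assuming each tier batch covers at least two failure types):
# ``_scenarios_from_schedule("easy:2,hard:2", total=6)`` yields 2 EASY scenarios,
# then 2 HARD, then pads with further HARD scenarios generated from seed offset 9000
# until the list reaches ``total`` and is truncated to exactly 6.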
# ---------------------------------------------------------------------------
# Training loop — modern TRL GRPO API
# ---------------------------------------------------------------------------
def run_grpo(
*,
model,
tokenizer,
total_episodes: int,
group_size: int,
team_weight: float,
curriculum: Curriculum,
log_every: int,
output_dir: Path,
max_seq_length: int = 1024,
max_completion_length: int = 96,
learning_rate: float = 5e-6,
temperature: float = 0.7,
curriculum_schedule: str | None = None,
rogue_bonus_multiplier: float = 1.0,
) -> dict[str, Any]:
"""Run GRPO training via TRL's GRPOTrainer.
``total_episodes`` caps the number of optimisation steps (``max_steps``).
Each optim step consumes one unique prompt from the dataset and rolls
``group_size`` completions — the classic GRPO group.
"""
from trl import GRPOConfig, GRPOTrainer # type: ignore[import-not-found]
output_dir.mkdir(parents=True, exist_ok=True)
scenario_count = max(total_episodes, 8)
if curriculum_schedule:
scenarios = _scenarios_from_schedule(
curriculum_schedule, total=scenario_count
)
print(
f"[grpo_train] curriculum schedule active: {curriculum_schedule} "
f"({len(scenarios)} scenarios across tiers)"
)
else:
scenarios = _collect_scenarios(curriculum, total=scenario_count)
dataset = build_training_dataset(scenarios)
# Every optim step: 1 unique prompt × group_size completions.
per_device_train_batch_size = group_size
config = GRPOConfig(
output_dir=str(output_dir),
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=1,
num_generations=group_size,
temperature=temperature,
max_prompt_length=max_seq_length,
max_completion_length=max_completion_length,
learning_rate=learning_rate,
logging_steps=log_every,
max_steps=total_episodes,
save_steps=max(total_episodes, 10_000),
save_strategy="no",
report_to=[],
bf16=False,
fp16=True,
remove_unused_columns=False,
)
reward_fn = make_reward_fn(team_weight, rogue_bonus_multiplier=rogue_bonus_multiplier)
if rogue_bonus_multiplier != 1.0:
print(
f"[grpo_train] rogue rubric ×{rogue_bonus_multiplier} "
f"(catch={50.0 * rogue_bonus_multiplier:+.0f}, "
f"FP={-75.0 * rogue_bonus_multiplier:+.0f})"
)
metrics_callback = _make_metrics_callback(output_dir)
trainer = GRPOTrainer(
model=model,
processing_class=tokenizer,
args=config,
train_dataset=dataset,
reward_funcs=[reward_fn],
callbacks=[metrics_callback],
)
trainer.train()
# Persist final LoRA adapter for downstream inference.
adapter_dir = output_dir / "lora_adapter"
try:
trainer.model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
except Exception as exc: # pragma: no cover — best-effort
print(f"[grpo_train] could not save adapter: {exc}")
# Guarantee the metrics file exists for the plot cell even if no log event fired.
metrics_path = output_dir / "training_metrics.json"
if not metrics_path.exists():
metrics_path.write_text(
json.dumps(
[
{
"episode": 0,
"mean_team_reward": 0.0,
"mean_oversight_reward": 0.0,
"mean_combined_reward": 0.0,
}
],
indent=2,
)
)
rewards_collected = [e["mean_combined_reward"] for e in metrics_callback.log]
summary = {
"final_tier": curriculum.tier.value,
"total_episodes": total_episodes,
"dataset_size": len(dataset),
"group_size": group_size,
"metrics_path": str(metrics_path),
"adapter_path": str(adapter_dir),
"mean_logged_reward": (
statistics.mean(rewards_collected) if rewards_collected else float("nan")
),
}
return summary
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen2.5-3B-Instruct",
help="HF repo id. Use 7B variant once GPU is provisioned.",
)
parser.add_argument("--total-episodes", type=int, default=30)
parser.add_argument("--group-size", type=int, default=2)
parser.add_argument("--team-weight", type=float, default=0.6)
parser.add_argument("--log-every", type=int, default=2)
parser.add_argument("--max-seq-length", type=int, default=1024)
parser.add_argument("--lora-rank", type=int, default=16)
parser.add_argument(
"--output-dir", type=Path, default=Path("artifacts/chaosops-grpo")
)
parser.add_argument(
"--start-tier",
type=str,
default=DifficultyTier.EASY.value,
choices=[t.value for t in DifficultyTier],
)
parser.add_argument(
"--backend",
type=str,
default="auto",
choices=["auto", "unsloth", "transformers"],
help="Model loader. 'auto' tries Unsloth, falls back to transformers.",
)
parser.add_argument(
"--learning-rate",
type=float,
default=5e-6,
help="GRPO learning rate. Default 5e-6; 2e-5 if reward stays flat.",
)
parser.add_argument(
"--temperature",
type=float,
default=0.7,
help="Sampling temperature for completions during GRPO rollout.",
)
parser.add_argument(
"--curriculum-schedule",
type=str,
default=None,
help=(
"Step-budget tier schedule, e.g. 'easy:200,medium:200,hard:200'. "
"Overrides --start-tier when set."
),
)
parser.add_argument(
"--rogue-bonus-multiplier",
type=float,
default=1.0,
help=(
"Scale BOTH the OversightRubric rogue-catch bonus (+50) and FP "
"penalty (-75) by this factor. >1.0 amplifies the gradient on "
"flag_rogue actions; useful when prior runs collapsed off them."
),
)
return parser.parse_args()
def main() -> None:
args = _parse_args()
model, tokenizer = load_model(
args.model_name,
backend=args.backend,
max_seq_length=args.max_seq_length,
lora_rank=args.lora_rank,
)
curriculum = Curriculum(tier=DifficultyTier(args.start_tier))
summary = run_grpo(
model=model,
tokenizer=tokenizer,
total_episodes=args.total_episodes,
group_size=args.group_size,
team_weight=args.team_weight,
curriculum=curriculum,
log_every=args.log_every,
output_dir=args.output_dir,
max_seq_length=args.max_seq_length,
learning_rate=args.learning_rate,
temperature=args.temperature,
curriculum_schedule=args.curriculum_schedule,
rogue_bonus_multiplier=args.rogue_bonus_multiplier,
)
print(json.dumps(summary, indent=2))
if __name__ == "__main__":
main()