Pratyush-01 committed
Commit b4bd6d8 · verified · 1 Parent(s): b788dab

cleanup: strip verbose comments from physix/training/loop.py

Files changed (1)
  1. physix/training/loop.py +18 -255
physix/training/loop.py CHANGED
@@ -1,21 +1,6 @@
-"""GRPO training loop using Unsloth + TRL + W&B.
+"""GRPO training loop using Unsloth + TRL.
 
-Requires the ``[train]`` optional dependency group. Importing this module on
-a machine without the heavy ML deps installed will fail at module load,
-which is the documented contract — local development tools (env server,
-verifier, demo UI) live in lighter modules and remain usable.
-
-Run via::
-
-    python -m physix.training.loop \
-        --model Qwen/Qwen2.5-1.5B-Instruct \
-        --output-dir runs/physix-1.5b-rl \
-        --num-steps 300
-
-Environment variables:
-
-- ``WANDB_PROJECT`` (default ``physix-live``)
-- ``HUGGINGFACE_HUB_TOKEN`` if pushing the adapter to the Hub
+Requires the ``[train]`` optional dependency group (heavy ML deps).
 """
 
 from __future__ import annotations
@@ -40,17 +25,8 @@ from physix.training.dataset import (
 from physix.training.reward_fns import make_reward_funcs
 from physix.training.scorer import Scorer
 
-# IMPORTANT: Unsloth's GRPO patches must be applied *before* importing
-# ``GRPOTrainer`` so its kernels are swapped in. Without this, the trainer
-# falls back to the stock TRL path and Unsloth's optimisations are bypassed
-# (and on recent versions the import will hard-fail). Keep this block
-# directly above the ``trl`` import — order matters.
-#
-# Version note: this requires ``trl<=0.24.0``. Newer TRL versions ship
-# ``trl.experimental.openenv`` which Unsloth's ``patch_trl_openenv``
-# hook tries to ``inspect.getsource()`` on; that fails with ``OSError:
-# could not get source code`` and crashes ``PatchFastRL``. ``trl==0.24.0``
-# is the pinned upper bound declared in unsloth's pyproject.toml.
+# Unsloth patches must be applied before importing GRPOTrainer — order matters.
+# Requires trl<=0.24.0; newer versions break PatchFastRL.
 from unsloth import FastLanguageModel, PatchFastRL  # noqa: E402
 
 PatchFastRL("GRPO", FastLanguageModel)
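The ordering contract the stripped comment documented is easy to get wrong when refactoring imports, and the ``trl`` side of the sequence sits outside this hunk. A minimal sketch of the full required order (the ``trl`` import line is an assumption, not shown in the diff)::

    from unsloth import FastLanguageModel, PatchFastRL  # noqa: E402

    PatchFastRL("GRPO", FastLanguageModel)  # swap Unsloth kernels into TRL's GRPO path

    # Only now is it safe to import the trainer; with trl>0.24.0 PatchFastRL
    # itself crashes (inspect.getsource on trl.experimental.openenv fails).
    from trl import GRPOConfig, GRPOTrainer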
@@ -71,19 +47,9 @@ class TrainingConfig(BaseModel):
     model_config = ConfigDict(frozen=True)
 
     model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"
-    #: Optional path to a LoRA adapter produced by the SFT warm-start step.
-    #: When set, the base model is loaded and the adapter weights are applied
-    #: before GRPO begins. Without this the cold base model rarely produces
-    #: any reward signal in early steps.
+    #: Path to merged SFT model to warm-start GRPO from.
     sft_checkpoint: Optional[str] = None
-    #: Optional Hub repo id (or local path) of an existing LoRA adapter to
-    #: warm-start GRPO from — e.g. a previous GRPO run that was interrupted
-    #: and pushed checkpoints to ``hub_checkpoint_repo_id``. When set, the
-    #: base ``model_name`` is loaded and this adapter is applied as the
-    #: starting trainable LoRA (skipping the fresh ``get_peft_model`` call).
-    #: SFT is unnecessary in this case (the adapter is already downstream
-    #: of an SFT warm-start), so leave ``sft_checkpoint`` unset when using
-    #: this flag.
+    #: Hub repo id or local path of an existing LoRA adapter to resume from.
     lora_adapter_repo: Optional[str] = None
     output_dir: str = "runs/physix-1.5b-rl"
     max_seq_length: int = 2048
@@ -97,31 +63,19 @@ class TrainingConfig(BaseModel):
     per_device_train_batch_size: int = 1
     gradient_accumulation_steps: int = 8
     num_steps: int = 300
-    #: Stop early if ``reward_std`` stays below 0.05 for this many consecutive
-    #: logged steps. Set to 0 to disable early stopping.
+    #: Set to 0 to disable early stopping.
     early_stop_patience: int = 50
     seed: int = 0
     instances_per_system: int = 32
-    #: Subset of system IDs to train on. Defaults to all SUPPORTED_SYSTEMS.
-    #: Pass a single ID (e.g. ``("damped_spring",)``) for focused single-task runs.
     system_ids: tuple[str, ...] = SUPPORTED_SYSTEMS
     ablation: Optional[Ablation] = None
     wandb_project: str = "physix-live"
     wandb_run_name: Optional[str] = None
     push_to_hub: bool = False
     hub_repo_id: Optional[str] = None
-    #: HF repo to push LoRA checkpoints to every save_steps during GRPO.
-    #: Separate from hub_repo_id (which receives the final merged model).
-    #: Set this to enable mid-run checkpoint persistence and W&B artifact logging.
+    #: HF repo to push LoRA checkpoints to every save_steps.
     hub_checkpoint_repo_id: Optional[str] = None
-    #: Path to a Trainer checkpoint dir to resume GRPO from (e.g. from a
-    #: previous run killed mid-training). Set automatically by train.sh.
     resume_from_checkpoint: Optional[str] = None
-    #: How to persist the final adapter. ``"lora"`` saves only the adapter
-    #: weights (small, requires the base model at load time). ``"merged_16bit"``
-    #: merges the adapter into the base and saves a deployable bf16/fp16
-    #: checkpoint (large, but loadable as a normal HF model — what you want
-    #: for Hub pushes and Ollama exports).
     save_method: SaveMethod = "merged_16bit"
 
 
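The two resume-related fields whose comments were stripped are meant to be mutually exclusive with ``sft_checkpoint``. A hypothetical resume configuration using only fields defined above (repo ids are placeholders)::

    config = TrainingConfig(
        model_name="Qwen/Qwen2.5-1.5B-Instruct",
        # Resume: apply an existing adapter; leave sft_checkpoint unset,
        # since that adapter is already downstream of an SFT warm-start.
        lora_adapter_repo="your-org/physix-grpo-ckpt",
        hub_checkpoint_repo_id="your-org/physix-grpo-ckpt",
        early_stop_patience=0,  # 0 disables early stopping
    )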
@@ -140,8 +94,6 @@ def train(config: TrainingConfig) -> None:
         resume="allow",
     )
 
-    # Pin a few high-signal pointers into the run summary right away so the
-    # W&B "Overview" tab shows them prominently (no scrolling, no hunting).
     if config.hub_checkpoint_repo_id:
         ckpt_url = f"https://huggingface.co/{config.hub_checkpoint_repo_id}"
         wandb.run.summary["checkpoint/repo"] = config.hub_checkpoint_repo_id
@@ -156,8 +108,6 @@ def train(config: TrainingConfig) -> None:
         wandb.run.summary["resume/from_url"] = (
             f"https://huggingface.co/{config.lora_adapter_repo}"
         )
-        # If a parent W&B run is named (set by the orchestrator script),
-        # surface it prominently so the lineage is one click away.
         parent_run = os.environ.get("WANDB_RESUMED_FROM")
         if parent_run:
             wandb.run.summary["resume/parent_wandb_run"] = parent_run
@@ -217,21 +167,7 @@ def train(config: TrainingConfig) -> None:
 
 
 def _log_reward_summary(trainer: "GRPOTrainer") -> None:
-    """Emit a final reward-signal summary at end of training.
-
-    Pulls the last ``log_history`` entry that contains reward keys and prints
-    the mean of every ``rewards/*/mean`` it finds. If *no* reward keys are
-    present we hard-fail — that means the reward functions never produced a
-    non-NaN value, which is a real bug worth surfacing.
-
-    Note on ``train/loss``: this scalar IS the GRPO surrogate objective
-    (advantage-weighted token log-probabilities, plus the KL-to-ref penalty
-    when ``beta > 0``). Per the TRL docs (``trl/docs/source/grpo_trainer.md``)
-    the ``Trainer`` superclass logs the full surrogate as ``loss``, not just
-    the KL term. So ``train/loss`` collapsing without ``train/reward`` rising
-    is a real failure mode — typically a sign of reward hacking or saturated
-    advantages — and should be debugged, not dismissed.
-    """
+    """Log first→last reward delta for every component. Raises if no rewards were logged."""
     history = getattr(trainer.state, "log_history", []) or []
     reward_entries = [
         entry for entry in history
@@ -257,16 +193,6 @@ def _log_reward_summary(trainer: "GRPOTrainer") -> None:
         v1 = last.get(key)
         if isinstance(v0, (int, float)) and isinstance(v1, (int, float)):
             _log.info("  %-40s %.4f → %.4f (Δ=%+.4f)", key, v0, v1, v1 - v0)
-    _log.info("-" * 60)
-    _log.info("Interpretation guide:")
-    _log.info("  train/loss — full GRPO surrogate (policy + KL*beta).")
-    _log.info("    Should DECREASE as advantages get exploited.")
-    _log.info("  train/reward — mean episode reward across rollouts.")
-    _log.info("    Should INCREASE; this is the headline curve.")
-    _log.info("  train/kl — KL(policy || ref). Should grow slowly.")
-    _log.info("  rewards/*/mean — per-component reward (match, simplicity, …).")
-    _log.info("Loss-down WITHOUT reward-up is a red flag (reward hacking or")
-    _log.info("advantage saturation).")
     _log.info("=" * 60)
 
 
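The deleted interpretation guide boils down to one check: the loss (the GRPO surrogate) falling while the reward stays flat is a failure signal, not progress. A post-hoc sketch over a finished trainer's ``log_history`` (variable names hypothetical)::

    history = trainer.state.log_history
    loss = [(e["step"], e["loss"]) for e in history if "loss" in e]
    reward = [(e["step"], e["reward"]) for e in history if "reward" in e]
    if loss and reward and loss[-1][1] < loss[0][1] and reward[-1][1] <= reward[0][1]:
        # Loss down without reward up: suspect reward hacking or saturated
        # advantages; inspect the rewards/*/mean series per component.
        print("reward signal did not improve: debug before trusting this run")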
@@ -274,32 +200,7 @@ def _render_training_curves(
     trainer: "GRPOTrainer",
     config: TrainingConfig,
 ) -> None:
-    """Render the headline training curves to PNG and ship them.
-
-    Why we do this in-process at end of training (instead of pulling from
-    W&B post-hoc):
-
-    1. The competition's automated validation requires PNG plots committed
-       to the public repo at submission time. Wandb-only links don't count.
-    2. ``trainer.state.log_history`` already contains every metric the
-       Trainer logged step-by-step — no API roundtrip needed.
-    3. We can also push the PNGs to the model Hub repo so they're discoverable
-       from the model card without a separate deploy step.
-
-    Renders three curves:
-
-    - ``loss.png`` — ``train/loss`` over global step.
-      GRPO surrogate; SHOULD trend down.
-    - ``reward.png`` — ``reward`` (or ``train/reward``) over step
-      with ±1σ band. SHOULD trend up.
-    - ``reward_components.png`` — overlay of every ``rewards/<name>/mean``
-      so reward hacking shows up visually (e.g. ``simplicity`` rising while
-      ``match`` regresses).
-
-    Failures are logged and swallowed — a missing plot must not crash a
-    successful training run, since the model artefact is still useful.
-    """
+    """Render loss/reward/component PNGs from log_history and push to Hub."""
     try:
         import matplotlib
         matplotlib.use("Agg")  # headless / no display server in HF Jobs
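The rendering path the removed docstring motivates needs no W&B roundtrip; everything comes from ``trainer.state.log_history``. A standalone sketch of the ``loss.png`` case (figure size as in the code below; output path illustrative)::

    from pathlib import Path

    import matplotlib
    matplotlib.use("Agg")  # choose the headless backend before pyplot loads
    import matplotlib.pyplot as plt

    history = trainer.state.log_history  # assumes a finished GRPOTrainer
    steps = [e["step"] for e in history if "loss" in e]
    losses = [e["loss"] for e in history if "loss" in e]

    Path("plots").mkdir(exist_ok=True)
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.plot(steps, losses)
    ax.set_xlabel("global step")
    ax.set_ylabel("train/loss (GRPO surrogate)")
    fig.savefig("plots/loss.png", dpi=150)
    plt.close(fig)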
@@ -329,7 +230,6 @@
 
 
     rendered: list[Path] = []
-    # 1) Loss — the GRPO surrogate.
     steps_l, losses = _series("loss")
     if steps_l:
         fig, ax = plt.subplots(figsize=(8, 4.5))
@@ -346,7 +246,6 @@
     else:
         _log.warning("No 'loss' entries in log_history.")
 
-    # 2) Reward — headline curve (with ±std band when available).
     steps_r, rewards = _series("reward")
     _, reward_std = _series("reward_std")
     if steps_r:
@@ -371,7 +270,6 @@
     else:
         _log.warning("No 'reward' entries in log_history.")
 
-    # 3) Per-component reward overlay — exposes reward hacking patterns.
     component_keys = sorted({
         k for entry in history for k in entry
         if k.startswith("rewards/") and k.endswith("/mean")
@@ -400,8 +298,6 @@
 
     _log.info("Rendered %d curve PNG(s) to %s", len(rendered), plots_dir)
 
-    # Log the PNGs as wandb.Images so they appear in the run's Media tab,
-    # and persist to the run summary as a reference table.
     try:
         import wandb
         if wandb.run is not None:
@@ -412,8 +308,6 @@
     except Exception as exc:  # noqa: BLE001
         _log.warning("Could not log plots to wandb: %s", exc)
 
-    # Push PNGs to the final Hub model repo under ``plots/`` so the model
-    # card can render them and ``sync-plots.sh`` can pull them locally.
     if config.push_to_hub and config.hub_repo_id:
         try:
             from huggingface_hub import HfApi, create_repo
@@ -445,22 +339,8 @@
 def _load_model_and_tokenizer(
     config: TrainingConfig,
 ) -> tuple[FastLanguageModel, AutoTokenizer]:
-    """Load Qwen via Unsloth in 4-bit and attach a LoRA adapter.
-
-    If ``config.sft_checkpoint`` is set, the SFT adapter weights are merged
-    on top of the base model before GRPO starts. This gives GRPO a warm base
-    policy that already knows the JSON format and equation grammar, so early
-    rollouts produce meaningful reward signal instead of all scoring zero.
-    """
+    """Load model via Unsloth in 4-bit and attach a LoRA adapter."""
     if config.lora_adapter_repo:
-        # Resume path: load the base model and attach the existing LoRA
-        # adapter via PEFT. We deliberately do NOT call
-        # ``FastLanguageModel.from_pretrained(model_name=adapter_repo)``
-        # because the adapter's ``adapter_config.json`` may carry a stale
-        # ``base_model_name_or_path`` pointing at a path that only existed
-        # inside the previous training container (e.g. ``/tmp/physix-sft/merged``).
-        # PEFT's ``load_adapter`` ignores that field — it adapts onto whatever
-        # base we hand it.
         _log.info(
             "Resuming from existing LoRA adapter %s on top of %s",
             config.lora_adapter_repo,
@@ -472,12 +352,6 @@
             load_in_4bit=True,
             dtype=None,
         )
-        # Wrap the base in a fresh trainable LoRA, then overwrite its
-        # weights with the saved adapter. We use the adapter's own r/alpha
-        # by relying on PEFT's ``load_adapter`` resolving from the repo's
-        # adapter_config.json. The dummy ``get_peft_model`` call is just to
-        # turn the model into a ``PeftModel`` instance whose ``load_adapter``
-        # method accepts a hub repo id.
         model = FastLanguageModel.get_peft_model(
             model,
             r=config.lora_r,
@@ -490,8 +364,6 @@
             use_gradient_checkpointing="unsloth",
             random_state=config.seed,
         )
-        # Overwrite the freshly-initialised LoRA weights with the saved ones.
-        # ``adapter_name='default'`` matches what ``get_peft_model`` creates.
         model.load_adapter(
             config.lora_adapter_repo,
             adapter_name="default",
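The resume path the deleted comments walk through (base loaded by name, fresh LoRA wrapper, saved weights loaded over it) reads more clearly in isolation. A sketch under the same assumptions, with the repo id and LoRA hyperparameters as placeholders::

    from unsloth import FastLanguageModel

    # Load the base by name, NOT via the adapter repo: the adapter's
    # adapter_config.json may carry a stale base_model_name_or_path.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="Qwen/Qwen2.5-1.5B-Instruct",
        load_in_4bit=True,
    )
    # get_peft_model makes the base a PeftModel so load_adapter exists.
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # placeholder; the real run passes config.lora_r
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    )
    # Overwrite the fresh "default" adapter with the saved weights;
    # load_adapter ignores the stale base path in the repo's config.
    model.load_adapter("your-org/physix-grpo-ckpt", adapter_name="default")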
@@ -571,28 +443,7 @@
 
 
 def _select_reward_funcs(ablation: Optional[Ablation]) -> list[object]:
-    """Return the GRPO reward function set.
-
-    Default set (5 functions, summed by GRPOTrainer into the advantage):
-
-    - ``reward_match`` — raw R² (linear).
-    - ``reward_match_dense`` — sqrt(R²); dense low-value gradient.
-    - ``reward_correctness`` — binary cliff at R² ≥ 0.70.
-    - ``reward_simplicity`` — gated on R² ≥ 0.10 (anti-hack).
-    - ``reward_format`` — 1.0 only if parsed AND simulated.
-
-    Why this composition: empirically (RCA from W&B run 5kuqns9x) the
-    previous ``{match, progress, simplicity, format}`` mix had a
-    progress-equals-match duplicate (single-turn ``previous_r_match=0``)
-    AND let the model farm format+simplicity by emitting trivial
-    parseable equations. The new set both removes the duplicate and
-    triple-weights correctness via three different correctness-shaped
-    signals (match, match_dense, correctness_bonus) so that physical
-    accuracy dominates the GRPO advantage.
-
-    Ablations strip one signal at a time (used by the experiment matrix,
-    not by the main runs).
-    """
+    """Return the active reward function list, optionally with one signal ablated."""
     scorer = Scorer()
     funcs = make_reward_funcs(scorer)
     full = [
@@ -609,10 +460,7 @@
     if ablation == "no_format":
         return [funcs["match"], funcs["match_dense"], funcs["correctness"], funcs["simplicity"]]
     if ablation == "no_progress":
-        # Backward-compat alias: ``progress`` no longer exists, the new
-        # reward set already excludes it. Treat ``no_progress`` as the
-        # full default set so old job configs still work without surprise.
-        return full
+        return full  # progress was removed; treat as full set for backward compat
     raise ValueError(
         f"Unknown ablation {ablation!r}. Choose from "
         "no_progress | no_simplicity | no_format | None."
@@ -620,17 +468,7 @@
 
 
 class _RewardConvergenceCallback(TrainerCallback):
-    """Stop training early when the GRPO reward has converged.
-
-    Convergence criterion: ``reward_std`` (std of the total reward across the
-    rollout batch) stays below ``min_std`` for ``patience`` consecutive
-    logged steps. When ``reward_std ≈ 0`` every generation scores the
-    same, so the GRPO advantage estimates are all zero and the policy
-    gradient vanishes — continuing burns compute without learning.
-
-    The callback also logs the early-stop event to W&B so the decision
-    is visible on the run page.
-    """
+    """Stop early when reward_std stays below min_std for `patience` consecutive steps."""
 
     def __init__(self, patience: int = 50, min_std: float = 0.05) -> None:
         self._patience = patience
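The convergence criterion from the stripped docstring maps one-to-one onto the ``transformers`` callback API. A minimal sketch (the real class also logs the stop event to W&B)::

    from transformers import TrainerCallback

    class RewardConvergenceSketch(TrainerCallback):
        """Stop once reward_std < min_std for `patience` consecutive logs."""

        def __init__(self, patience: int = 50, min_std: float = 0.05) -> None:
            self._patience = patience
            self._min_std = min_std
            self._streak = 0  # consecutive low-variance log events

        def on_log(self, args, state, control, logs=None, **kwargs):
            std = (logs or {}).get("reward_std")
            if std is None:
                return
            # reward_std ≈ 0 means every rollout scores the same, so all
            # GRPO advantages are zero and the policy gradient vanishes.
            self._streak = self._streak + 1 if std < self._min_std else 0
            if self._patience and self._streak >= self._patience:
                control.should_training_stop = True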
@@ -679,37 +517,12 @@
 
 
 class _WandbCheckpointCallback(TrainerCallback):
-    """Make checkpoints first-class in W&B.
-
-    After every Trainer save, this callback:
-
-    1. Resolves the latest commit hash on the Hub repo (best-effort — the
-       trainer's own ``PushToHubCallback`` runs ``git push`` asynchronously
-       so we may briefly see an older commit; that is fine, it self-corrects
-       on the next save).
-    2. Updates the W&B run summary with persistent, prominent keys
-       (visible in the "Overview" tab of the run):
-       - ``checkpoint/last_step``
-       - ``checkpoint/last_commit``
-       - ``checkpoint/repo_url``
-       - ``checkpoint/last_url``
-    3. Logs a step-indexed scalar ``checkpoint/step`` so a chart appears
-       on the W&B run page (one tick per save).
-    4. Maintains a running ``checkpoint_history`` ``wandb.Table`` so every
-       saved checkpoint is browsable as a sortable table directly on the
-       run page (Tables tab).
-    5. Prints a banner to stdout (visible in ``hf jobs logs``) with the
-       direct URL — so the checkpoint is also impossible to miss in the
-       job logs.
-
-    No model bytes are uploaded to W&B; the actual weights live on the HF
-    Hub checkpoint repo. We never crash training if any of this fails.
-    """
+    """Log checkpoint metadata to W&B summary and stdout after each Trainer save."""
 
     def __init__(self, hub_checkpoint_repo_id: str) -> None:
         self._repo = hub_checkpoint_repo_id
         self._repo_url = f"https://huggingface.co/{hub_checkpoint_repo_id}"
-        self._table = None  # lazy: wandb may not be initialised at __init__
+        self._table = None
 
     def on_train_begin(
         self,
@@ -718,8 +531,6 @@
         control: TrainerControl,
         **kwargs,
     ) -> None:
-        # Pin the repo URL into the run config + summary at the very start
-        # so the link is visible on the W&B "Overview" panel from step 0.
         try:
             import wandb
 
@@ -735,11 +546,6 @@
                 f"\n[wandb] Checkpoint repo pinned in run summary: {self._repo_url}\n",
                 flush=True,
             )
-
-            # Stash the W&B run id at the *root* of the checkpoint repo so a
-            # future re-launch can find it without W&B API calls. Atomic with
-            # checkpoint storage, ~36 bytes. We do this once at train begin
-            # instead of every save to avoid 200 redundant commits.
             self._publish_wandb_run_id(wandb.run.id)
         except Exception as exc:  # noqa: BLE001
             _log.warning("Could not pin checkpoint repo to W&B summary: %s", exc)
@@ -788,28 +594,18 @@
             else f"{self._repo_url}/tree/main"
         )
 
-        # 1. Persistent summary keys (top-of-run, always visible).
         wandb.run.summary["checkpoint/last_step"] = step
         wandb.run.summary["checkpoint/last_commit"] = commit_sha or "pending"
         wandb.run.summary["checkpoint/last_url"] = tree_url
-
-        # 2. Step-indexed scalar so a small chart appears on the run page.
         wandb.log({"checkpoint/step": step}, step=step)
 
-        # 3. Running history table.
         if self._table is None:
             self._table = wandb.Table(
                 columns=["step", "commit", "url", "repo"]
            )
         self._table.add_data(step, commit_sha or "pending", tree_url, self._repo)
-        # Re-log the entire table each time so the latest version shows.
         wandb.log({"checkpoint_history": self._table}, step=step)
 
-        # 4. Pointer-only W&B Artifact (~200 bytes JSON). Doesn't upload
-        #    weights — those are on the Hub already — but makes every
-        #    checkpoint a first-class, addressable W&B artifact that can
-        #    be looked up later by `wandb artifact get`. Side effect:
-        #    populates the run's "Artifacts" panel with one entry per save.
         if commit_sha:
             from physix.training.checkpoints import (
                 CheckpointHandle,
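The pointer-only artifact described in the removed comment block uploads coordinates, not weights. A generic sketch with plain ``wandb`` calls (the real code goes through ``physix.training.checkpoints`` helpers; ``repo_id``, ``commit_sha`` and ``step`` are assumed in scope)::

    import json

    import wandb

    # ~200-byte pointer: weights stay on the Hub, W&B stores only the
    # repo/commit coordinates, making each save an addressable artifact.
    artifact = wandb.Artifact("physix-grpo-checkpoint", type="model-pointer")
    with artifact.new_file("pointer.json", mode="w") as f:
        json.dump({"repo": repo_id, "commit": commit_sha, "step": step}, f)
    wandb.log_artifact(artifact)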
@@ -826,7 +622,6 @@
                 artifact_name="physix-grpo-checkpoint",
             )
 
-        # 5. Stdout banner — also visible in `hf jobs logs`.
         print(
             "\n"
             "================ CHECKPOINT SAVED ================\n"
@@ -852,13 +647,7 @@
         )
 
     def _latest_commit_sha(self) -> Optional[str]:
-        """Best-effort fetch of the most recent commit on the checkpoint repo.
-
-        Uses ``HfApi.list_repo_commits`` if available; returns ``None`` on
-        any failure. The async ``git push`` may not be done at the instant
-        ``on_save`` fires, so we may see the *previous* checkpoint's commit;
-        that's acceptable — it self-corrects on the next save.
-        """
+        """Best-effort fetch of the latest commit SHA; returns None on failure."""
         try:
             from huggingface_hub import HfApi
 
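A minimal sketch of the lookup the removed docstring described (repo id is a placeholder; ``list_repo_commits`` returns commits newest first)::

    from huggingface_hub import HfApi

    # The trainer's async git push may still be in flight, so the head
    # commit can briefly lag the save that triggered this call.
    commits = HfApi().list_repo_commits("your-org/physix-grpo-ckpt")
    latest_sha = commits[0].commit_id if commits else None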
@@ -872,19 +661,6 @@
 
 
 def _build_grpo_config(config: TrainingConfig) -> GRPOConfig:
-    # Note on the metrics this run will produce in W&B (per TRL docs):
-    #   train/loss       — the GRPO surrogate objective being minimized.
-    #                      = -E[advantage * logπ(action|state)] + β * KL.
-    #                      Should DECREASE as the policy exploits advantages.
-    #   train/reward     — mean total reward per rollout. Should INCREASE.
-    #   train/kl         — KL(policy || reference). Bounded by β; grows slowly.
-    #   rewards/<f>/mean — per-component reward (one per reward function).
-    #
-    # ``train/loss`` going to ~0 *only* if ``train/reward`` rises in lockstep
-    # is fine — it just means advantages got fully exploited. Loss collapsing
-    # without reward growth is reward hacking, broken parsing, or a saturated
-    # KL anchor. We surface both via _log_reward_summary at end of training
-    # AND via _GenerateCurvesCallback which renders both curves to PNG.
     effective_batch = (
         config.per_device_train_batch_size * config.gradient_accumulation_steps
     )
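The function body past ``effective_batch`` sits outside this diff; a hypothetical construction consistent with the config fields above (all parameters are stock ``GRPOConfig``/``TrainingArguments`` ones)::

    effective_batch = (
        config.per_device_train_batch_size * config.gradient_accumulation_steps
    )  # 1 × 8 = 8 prompts per optimizer step with the defaults above

    return GRPOConfig(
        output_dir=config.output_dir,
        per_device_train_batch_size=config.per_device_train_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        max_steps=config.num_steps,
        seed=config.seed,
        report_to="wandb",
    )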
@@ -933,20 +709,7 @@ def _save_artifacts(
     tokenizer: AutoTokenizer,
     config: TrainingConfig,
 ) -> None:
-    """Persist the trained adapter via Unsloth's save path.
-
-    ``save_pretrained_merged`` dispatches on ``save_method``:
-
-    - ``"lora"``: writes only the adapter weights (small; requires the base
-      model at load time).
-    - ``"merged_16bit"``: merges LoRA into base and writes a standard HF
-      checkpoint in bf16/fp16 (large; loadable without Unsloth, exportable to
-      GGUF for Ollama).
-    - ``"merged_4bit"``: same merge but quantised back to 4-bit.
-
-    Hub pushes use the same ``save_method`` so the on-disk artifact and the
-    Hub artifact are byte-identical.
-    """
+    """Save model locally and optionally push to Hub."""
     out_path = Path(config.output_dir)
     out_path.mkdir(parents=True, exist_ok=True)
 
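The dispatch table in the removed docstring is Unsloth's saving API; a usage sketch (output dir and repo id are placeholders)::

    # "lora"         → adapter weights only (small; needs the base at load)
    # "merged_16bit" → LoRA merged into the base; plain HF checkpoint
    # "merged_4bit"  → same merge, re-quantised to 4-bit
    model.save_pretrained_merged(
        "runs/physix-1.5b-rl", tokenizer, save_method="merged_16bit"
    )
    # Same save_method for the Hub push so both artifacts match:
    model.push_to_hub_merged(
        "your-org/physix-1.5b-rl", tokenizer, save_method="merged_16bit"
    )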