cleanup: strip verbose comments from physix/training/reward_fns.py
physix/training/reward_fns.py
CHANGED
@@ -1,34 +1,4 @@
-"""TRL-compatible reward functions for GRPO training.
-
-Responsibility: expose a stateless reward function for each independent
-reward signal. Internally each component delegates to a shared
-:class:`Scorer` so a single completion is parsed and simulated exactly
-once per training step regardless of how many reward functions query it.
-
-The TRL signature for a reward function is::
-
-    def reward_func(*, prompts, completions, **kwargs) -> list[float]: ...
-
-where ``prompts`` and ``completions`` are batched lists. Extra columns from
-the training dataset arrive as keyword arguments — we expect the columns
-listed in :class:`SystemContext` to be present.
-
-Reward set design (anti-hack, RCA from W&B run 5kuqns9x):
-
-- ``reward_match`` — raw R² on the trajectory (linear).
-- ``reward_match_dense`` — sqrt(R²); denser gradient at low values.
-- ``reward_correctness`` — binary cliff at R² ≥ 0.70; pushes past plateau.
-- ``reward_simplicity`` — gated on R² ≥ 0.10 (no free reward for trivial
-  equations).
-- ``reward_format`` — 1.0 only if the equation parsed *and*
-  simulated. No partial credit for parseable
-  but uncomputable garbage.
-
-The legacy ``reward_progress`` is intentionally absent. In single-turn
-GRPO every dataset row carries ``previous_r_match=0``, which made
-``progress = max(0, match - 0) = match`` for every rollout — a perfect
-duplicate of ``reward_match`` that diluted advantage estimation.
-"""
+"""TRL-compatible reward functions for GRPO training."""
 
 from __future__ import annotations
 
@@ -45,31 +15,7 @@ RewardFunction = Callable[..., list[float]]
 def make_reward_funcs(
     scorer: Scorer | None = None,
 ) -> dict[str, RewardFunction]:
-    """Build
-
-    Each function is named ``reward_<component>`` so TRL's GRPO trainer
-    logs them individually to W&B under
-    ``train/rewards/reward_<component>/mean``.
-
-    The scorer is shared across all functions. TRL calls reward functions
-    one-by-one for the same batch (same ``completions`` list, same indices).
-    The ``match`` function resets the cache and populates it; the
-    remaining functions (``match_dense``, ``correctness``, ``simplicity``,
-    ``format``) reuse the cached results via ``cache_key=i``. This means
-    each completion is parsed + simulated exactly once per step regardless
-    of how many reward functions query it.
-
-    Returns a dict whose keys are:
-
-    - ``match`` / ``simplicity`` / ``format`` — direct reads from the
-      :class:`RewardBreakdown`. ``simplicity`` is internally gated on
-      match ≥ 0.10 and ``format`` on simulation success.
-    - ``match_dense`` — ``sqrt(match)`` for denser low-value gradient.
-    - ``correctness`` — binary 1.0 above an R² threshold (``0.70``).
-
-    All functions share the scorer cache, so they cost one parse +
-    simulate per completion combined, not five.
-    """
+    """Build reward functions keyed by component name, sharing a single scorer cache."""
     shared = scorer if scorer is not None else Scorer()
 
     def _make_breakdown_reader(component: str, *, reset_cache: bool) -> RewardFunction:
@@ -125,8 +71,7 @@ def make_reward_funcs(
 
     _reward_correctness.__name__ = "reward_correctness"
 
-    #
-    # so subsequent functions get fresh results for this step's completions.
+    # match resets the cache first so subsequent functions reuse parsed results.
     funcs: dict[str, RewardFunction] = {
         "match": _make_breakdown_reader("match", reset_cache=True),
         "simplicity": _make_breakdown_reader("simplicity", reset_cache=False),
@@ -138,12 +83,7 @@ def make_reward_funcs(
 
 
 def _hydrate_contexts(batch_size: int, kwargs: dict[str, Any]) -> list[SystemContext]:
-    """
-
-    TRL passes dataset columns as kwargs where each value is a list of
-    length ``batch_size``. We zip them together into per-row dicts and hand
-    each off to :func:`SystemContext.from_row`.
-    """
+    """Convert TRL batch kwargs into per-row SystemContext records."""
     expected_keys = (
         "system_id",
         "state_variables",
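For readers landing on this commit without the deleted context: the removed module docstring documented the reward set design. Below is a minimal sketch of how those five components could be derived from a single raw R² match score, keeping the two thresholds the docstring names (0.70 cliff, 0.10 gate). The helper name and the parsimony/simulated inputs are illustrative, not the repo's API.

import math

# Thresholds taken from the removed docstring; everything else is a sketch.
R2_CORRECTNESS_CLIFF = 0.70  # binary cliff: pushes policies past the plateau
R2_SIMPLICITY_GATE = 0.10    # no free simplicity reward for trivial equations

def shape_rewards(match: float, parsimony: float, simulated: bool) -> dict[str, float]:
    """Derive the five reward components from one raw R² match score."""
    return {
        "match": match,                             # raw R², linear
        "match_dense": math.sqrt(max(match, 0.0)),  # denser gradient at low R²
        "correctness": 1.0 if match >= R2_CORRECTNESS_CLIFF else 0.0,
        "simplicity": parsimony if match >= R2_SIMPLICITY_GATE else 0.0,
        "format": 1.0 if simulated else 0.0,        # parsed AND simulated only
    }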
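The deleted make_reward_funcs docstring also spelled out the caching contract: TRL calls the reward functions one at a time over the same batch, the match reader resets and populates the shared scorer cache, and the remaining readers reuse results via cache_key=i. A self-contained sketch of that pattern follows; CachingScorer and make_reader are hypothetical stand-ins for the repo's Scorer and _make_breakdown_reader.

from typing import Any, Callable

class CachingScorer:
    """Illustrative stand-in for the shared Scorer."""

    def __init__(self) -> None:
        self._cache: dict[int, dict[str, float]] = {}

    def reset(self) -> None:
        self._cache.clear()

    def breakdown(self, completion: str, *, cache_key: int) -> dict[str, float]:
        if cache_key not in self._cache:
            # The one expensive step: parse the equation, then simulate it.
            self._cache[cache_key] = self._parse_and_simulate(completion)
        return self._cache[cache_key]

    def _parse_and_simulate(self, completion: str) -> dict[str, float]:
        # Placeholder scores; the real Scorer runs a trajectory simulation here.
        return {"match": 0.0, "simplicity": 0.0, "format": 0.0}

def make_reader(scorer: CachingScorer, component: str, *, reset_cache: bool) -> Callable[..., list[float]]:
    def reward_fn(*, prompts: list[str], completions: list[str], **kwargs: Any) -> list[float]:
        if reset_cache:     # only the "match" reader resets, so it must run first
            scorer.reset()  # fresh results for this step's completions
        return [scorer.breakdown(c, cache_key=i)[component]
                for i, c in enumerate(completions)]
    return reward_fn

Because only the first reader pays for parsing and simulation, five reward signals cost one parse + simulate per completion combined, which is the claim the removed text made.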
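The rationale for dropping reward_progress can be checked in a few lines. With every dataset row carrying previous_r_match=0, the progress term reduces exactly to the match term; a sketch using the formula as the docstring stated it:

def reward_progress(match: float, previous_r_match: float = 0.0) -> float:
    return max(0.0, match - previous_r_match)

# In single-turn GRPO previous_r_match is always 0, so progress == match:
assert all(reward_progress(m) == m for m in (0.0, 0.25, 0.7, 1.0))

Two perfectly correlated reward columns add no information to the group-relative advantage, which is the dilution the docstring's RCA describes.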