"""
Simulated expert panel with shifting preferences across curriculum phases.

Reward-shaping design notes
---------------------------
Historically this panel applied a **multiplicative** shaping step:

    adjusted = clip01( base * (1 + modifier) ),  modifier in [-0.3, +0.3]

Analysis of 20 PPO iterations made two problems obvious:

1.  Saturation.  Any base >= 0.77 was clipped to exactly 1.0 under the
    maximum boost, and a large fraction of self-play rollouts landed in
    that zone every iteration (worked numbers after this list).  After
    the rollout buffer whitens advantages, a cluster of identical 1.0s
    flattens the policy gradient; that's the "policy_loss ~ -0.004
    across every iteration" signature.  Meanwhile the rare
    non-saturated outlier produces a huge standardized advantage,
    which spikes the KL and triggers the early stop.

2.  PRM triple-counting.  The panel used ``correctness`` and
    ``consensus`` weights, and the caller wired both to ``PRM_mean``.
    Combined with the PRM terms inside ``sol`` itself, a single frozen
    PRM's opinion drove ~75% of the variance in ``combined``.  The
    policy can game that by finding text the PRM likes without the
    answer being correct.
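
For concreteness, the saturation in (1) under the old multiplicative
rule, with illustrative base values:

    0.77 * (1 + 0.30) = 1.001  ->  clip01  ->  1.0
    0.95 * (1 + 0.30) = 1.235  ->  clip01  ->  1.0   (same reward, no gradient between them)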

The replacement here is:

*   **Additive** shaping with a tight bound (|modifier| <= 0.08 by
    default); see the sketch after this list.  No multiplication, no
    clip-to-1.  ``base`` stays in [0, 1] as computed by the
    environment, and shaping only nudges it a little; GAE + advantage
    normalization handle scale downstream.
*   The panel no longer consumes the PRM-correlated signals
    (``correctness``, ``consensus_score``).  Those already live inside
    ``sol``.  What the panel *does* add is curriculum-phase taste:
    clarity, solvability, difficulty match, novelty, format
    compliance.
*   A harder, one-sided format penalty: badly-formatted outputs get
    penalized more than well-formatted ones get rewarded.  Solutions
    that don't even parse should not win ties over ones that do.

Nothing about the public API changes: the returned dict still has
``adjusted_reward``, ``reward_modifier``, ``raw_modifier``,
``phase``/``description``, ``signals``, and ``feedback``.
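
Example (illustrative metric values; in practice the metric dicts come
from the calling environment):

    panel = SimulatedExpertPanel()
    result = panel.apply_expert_preferences(
        base_reward=0.62,
        question_metrics={
            "clarity": 0.8,
            "solvability_score": 0.7,
            "difficulty_score": 0.4,
            "novelty_combined": 0.5,
        },
        solution_metrics={"format_compliance": 1.0},
        iteration=2,
    )
    # Iteration 2 falls in the "pedagogy" phase and format_penalty is 0, so
    # raw_modifier = 0.30*0.3 + 0.25*0.2 + (-0.10)*(-0.1) = 0.15,
    # clipped to +0.08 -> result["adjusted_reward"] = 0.62 + 0.08 = 0.70
    # (the environment then re-clips to [0, 1]).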
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Optional


# Tight additive bound. With base in [0, 1] this keeps the final reward
# inside roughly [-0.08, 1.08]; the environment re-clips to [0, 1].
# Lower than the old 0.3 on purpose; shaping is a flavor term, not the
# main signal.
MAX_MODIFIER = 0.08


@dataclass(frozen=True)
class ExpertPhase:
    name: str
    start_iteration: int
    end_iteration: Optional[int]
    clarity_weight: float
    solvability_weight: float
    difficulty_weight: float
    novelty_weight: float
    format_penalty_weight: float
    description: str

    def active_for(self, iteration: int) -> bool:
        if iteration < self.start_iteration:
            return False
        if self.end_iteration is None:
            return True
        return iteration <= self.end_iteration


class SimulatedExpertPanel:
    """Applies phase-specific bounded **additive** reward shaping.

    No more multiplication, no more clip-to-1, and crucially no more
    ``correctness``/``consensus`` knobs (which used to double-count
    PRM_mean on top of ``sol``).  The panel now only shapes question
    quality and format; the correctness signal lives in ``sol`` alone.
    """

    def __init__(self) -> None:
        self._phases: List[ExpertPhase] = [
            ExpertPhase(
                name="pedagogy",
                start_iteration=0,
                end_iteration=3,
                clarity_weight=0.30,
                solvability_weight=0.25,
                difficulty_weight=-0.10,
                novelty_weight=0.00,
                format_penalty_weight=0.40,
                description="Prioritize clear, learnable, and solvable foundation tasks.",
            ),
            ExpertPhase(
                name="accuracy",
                start_iteration=4,
                end_iteration=6,
                clarity_weight=0.10,
                solvability_weight=0.20,
                difficulty_weight=0.00,
                novelty_weight=0.00,
                format_penalty_weight=0.70,
                description="Prioritize arithmetic correctness and agreement stability.",
            ),
            ExpertPhase(
                name="challenge",
                start_iteration=7,
                end_iteration=None,
                clarity_weight=0.10,
                solvability_weight=0.10,
                difficulty_weight=0.30,
                novelty_weight=0.20,
                format_penalty_weight=0.30,
                description="Prioritize challenging, novel, and diverse problems.",
            ),
        ]

    def get_current_expert(self, iteration: int) -> ExpertPhase:
        for phase in self._phases:
            if phase.active_for(iteration):
                return phase
        return self._phases[-1]

    def apply_expert_preferences(
        self,
        base_reward: float,
        question_metrics: Dict[str, object],
        solution_metrics: Dict[str, object],
        iteration: int,
    ) -> Dict[str, object]:
        phase = self.get_current_expert(iteration)

        clarity = float(question_metrics.get("clarity", 0.0))
        solvability = float(question_metrics.get("solvability_score", 0.0))
        difficulty = float(question_metrics.get("difficulty_score", 0.0))
        novelty = float(question_metrics.get("novelty_combined", 0.0))
        format_compliance = float(solution_metrics.get("format_compliance", 0.0))
        format_penalty = 1.0 - format_compliance

        # Centered versions keep the additive shaping close to zero when
        # quality signals are average; only genuinely good (>0.5) or
        # genuinely bad (<0.5) questions move the needle.  Without this,
        # every single rollout got a +0.15 bump just for producing a
        # non-empty string.
        clarity_c = clarity - 0.5
        solvability_c = solvability - 0.5
        difficulty_c = difficulty - 0.5
        novelty_c = novelty - 0.5

        raw_modifier = (
            phase.clarity_weight * clarity_c
            + phase.solvability_weight * solvability_c
            + phase.difficulty_weight * difficulty_c
            + phase.novelty_weight * novelty_c
            - phase.format_penalty_weight * format_penalty
        )
        modifier = max(-MAX_MODIFIER, min(MAX_MODIFIER, raw_modifier))

        # Additive, no multiplication.  We leave the final [0, 1] clip to
        # the caller (math_environment_curriculum) so it can combine the
        # shaping with its own format-floor rule.
        adjusted_reward = float(base_reward) + modifier
        return {
            "phase": phase.name,
            "description": phase.description,
            "phase_start_iteration": phase.start_iteration,
            "phase_end_iteration": phase.end_iteration,
            "base_reward": float(base_reward),
            "adjusted_reward": adjusted_reward,
            "reward_modifier": modifier,
            "raw_modifier": raw_modifier,
            "signals": {
                "clarity": clarity,
                "solvability": solvability,
                "difficulty_score": difficulty,
                "novelty": novelty,
                "format_compliance": format_compliance,
            },
            "feedback": self.get_expert_feedback(
                phase_name=phase.name,
                reward_modifier=modifier,
                signals={
                    "clarity": clarity,
                    "solvability": solvability,
                    "difficulty_score": difficulty,
                    "novelty": novelty,
                    "format_compliance": format_compliance,
                },
            ),
        }

    def get_expert_feedback(
        self,
        phase_name: str,
        reward_modifier: float,
        signals: Dict[str, float],
    ) -> str:
        direction = "boosted" if reward_modifier >= 0 else "penalized"
        if phase_name == "pedagogy":
            return (
                f"Pedagogy expert {direction} reward; clarity={signals['clarity']:.2f}, "
                f"solvability={signals['solvability']:.2f}, difficulty={signals['difficulty_score']:.2f}."
            )
        if phase_name == "accuracy":
            return (
                f"Accuracy expert {direction} reward; solvability={signals['solvability']:.2f}, "
                f"format={signals['format_compliance']:.2f}."
            )
        return (
            f"Challenge expert {direction} reward; difficulty={signals['difficulty_score']:.2f}, "
            f"novelty={signals['novelty']:.2f}, format={signals['format_compliance']:.2f}."
        )