# AxiomForgeAI: src/rl/expert_panel.py
"""
Simulated expert panel with shifting preferences across curriculum phases.
Reward-shaping design notes
---------------------------
Historically this panel applied a **multiplicative** shaping step:
adjusted = clip01( base * (1 + modifier) ), modifier in [-0.3, +0.3]
Analysis of 20 PPO iterations made two problems obvious:
1. Saturation. Any base >= 0.77 was clipped to exactly 1.0 with the
maximum boost, and a large fraction of self-play rollouts land in
that zone every iteration. After the rollout buffer whitens
advantages, a cluster of identical 1.0s flattens the policy
gradient — that's the "policy_loss ~ -0.004 across every
iteration" signature. Meanwhile the rare non-saturated outlier
produces a huge standardized advantage -> KL spikes -> early stop.
2. PRM triple-counting. The panel used ``correctness`` and
``consensus`` weights, and the caller wired both to ``PRM_mean``.
Combined with the PRM terms inside ``sol`` itself, a single frozen
PRM's opinion drove ~75% of the variance in ``combined``. The
policy can game that by finding text the PRM likes without the
answer being correct.
The replacement here is as follows; a worked comparison appears after the list:
* **Additive** shaping with a tight bound (|modifier| <= 0.08 by
default). No multiplication, no clip-to-1. ``base`` stays in
[0, 1] as computed by the environment, and shaping only nudges it
a little — GAE + advantage normalization handle scale downstream.
* The panel no longer consumes the PRM-correlated signals
(``correctness``, ``consensus_score``). Those already live inside
``sol``. What the panel *does* add is curriculum-phase taste:
clarity, solvability, difficulty match, novelty, format
compliance.
* A harder, one-sided format penalty: badly-formatted outputs get
penalized more than well-formatted ones get rewarded. Solutions
that don't even parse should not win ties over ones that do.
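Worked comparison
-----------------
Illustrative numbers (not taken from a logged rollout), using ``base = 0.85``
and the most favorable modifier each scheme allows:
    old:  clip01(0.85 * (1 + 0.30)) = clip01(1.105) = 1.00   (saturated)
    new:  0.85 + 0.08 = 0.93                                  (no clip-to-1 here)
Under the new scheme distinct rollouts no longer collapse onto an identical
1.0, so the whitened advantages keep a usable gradient signal.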
Nothing about the public API changes — the returned dict still has
``adjusted_reward``, ``reward_modifier``, ``raw_modifier``,
``phase``/``description``, ``signals``, and ``feedback``.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Optional
# Tight additive bound. With base in [0, 1] this keeps the final reward
# inside roughly [-0.08, 1.08]; the environment re-clips to [0, 1].
# Lower than the old 0.3 on purpose — shaping is a flavor term, not the
# main signal.
MAX_MODIFIER = 0.08
@dataclass(frozen=True)
class ExpertPhase:
name: str
start_iteration: int
end_iteration: Optional[int]
clarity_weight: float
solvability_weight: float
difficulty_weight: float
novelty_weight: float
format_penalty_weight: float
description: str
def active_for(self, iteration: int) -> bool:
if iteration < self.start_iteration:
return False
if self.end_iteration is None:
return True
return iteration <= self.end_iteration
class SimulatedExpertPanel:
"""Applies phase-specific bounded **additive** reward shaping.
No more multiplication, no more clip-to-1, and crucially no more
``correctness``/``consensus`` knobs (which used to double-count
PRM_mean on top of ``sol``). The panel now only shapes question
quality and format — the correctness signal lives in ``sol`` alone.
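    Phase schedule (iteration bounds are inclusive; the weights live in
    ``__init__``):
        * pedagogy  -- iterations 0-3: clarity and solvability dominate,
          with a slight preference for easier problems.
        * accuracy  -- iterations 4-6: heaviest format penalty.
        * challenge -- iterations 7+:  difficulty and novelty take over.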
"""
def __init__(self) -> None:
self._phases: List[ExpertPhase] = [
ExpertPhase(
name="pedagogy",
start_iteration=0,
end_iteration=3,
clarity_weight=0.30,
solvability_weight=0.25,
difficulty_weight=-0.10,
novelty_weight=0.00,
format_penalty_weight=0.40,
description="Prioritize clear, learnable, and solvable foundation tasks.",
),
ExpertPhase(
name="accuracy",
start_iteration=4,
end_iteration=6,
clarity_weight=0.10,
solvability_weight=0.20,
difficulty_weight=0.00,
novelty_weight=0.00,
format_penalty_weight=0.70,
description="Prioritize arithmetic correctness and agreement stability.",
),
ExpertPhase(
name="challenge",
start_iteration=7,
end_iteration=None,
clarity_weight=0.10,
solvability_weight=0.10,
difficulty_weight=0.30,
novelty_weight=0.20,
format_penalty_weight=0.30,
description="Prioritize challenging, novel, and diverse problems.",
),
]
def get_current_expert(self, iteration: int) -> ExpertPhase:
for phase in self._phases:
if phase.active_for(iteration):
return phase
return self._phases[-1]
def apply_expert_preferences(
self,
base_reward: float,
question_metrics: Dict[str, object],
solution_metrics: Dict[str, object],
iteration: int,
) -> Dict[str, object]:
phase = self.get_current_expert(iteration)
clarity = float(question_metrics.get("clarity", 0.0))
solvability = float(question_metrics.get("solvability_score", 0.0))
difficulty = float(question_metrics.get("difficulty_score", 0.0))
novelty = float(question_metrics.get("novelty_combined", 0.0))
format_compliance = float(solution_metrics.get("format_compliance", 0.0))
format_penalty = 1.0 - format_compliance
# Centered versions keep the additive shaping close to zero when
# quality signals are average; only genuinely good (>0.5) or
# genuinely bad (<0.5) questions move the needle. Without this,
# every single rollout got a +0.15 bump just for producing a
# non-empty string.
clarity_c = clarity - 0.5
solvability_c = solvability - 0.5
difficulty_c = difficulty - 0.5
novelty_c = novelty - 0.5
raw_modifier = (
phase.clarity_weight * clarity_c
+ phase.solvability_weight * solvability_c
+ phase.difficulty_weight * difficulty_c
+ phase.novelty_weight * novelty_c
- phase.format_penalty_weight * format_penalty
)
modifier = max(-MAX_MODIFIER, min(MAX_MODIFIER, raw_modifier))
# Additive, no multiplication. We leave the final [0, 1] clip to
# the caller (math_environment_curriculum) so it can combine the
# shaping with its own format-floor rule.
adjusted_reward = float(base_reward) + modifier
        signals = {
            "clarity": clarity,
            "solvability": solvability,
            "difficulty_score": difficulty,
            "novelty": novelty,
            "format_compliance": format_compliance,
        }
        return {
            "phase": phase.name,
            "description": phase.description,
            "phase_start_iteration": phase.start_iteration,
            "phase_end_iteration": phase.end_iteration,
            "base_reward": float(base_reward),
            "adjusted_reward": adjusted_reward,
            "reward_modifier": modifier,
            "raw_modifier": raw_modifier,
            "signals": signals,
            "feedback": self.get_expert_feedback(
                phase_name=phase.name,
                reward_modifier=modifier,
                signals=signals,
            ),
        }
def get_expert_feedback(
self,
phase_name: str,
reward_modifier: float,
signals: Dict[str, float],
) -> str:
direction = "boosted" if reward_modifier >= 0 else "penalized"
if phase_name == "pedagogy":
return (
f"Pedagogy expert {direction} reward; clarity={signals['clarity']:.2f}, "
f"solvability={signals['solvability']:.2f}, difficulty={signals['difficulty_score']:.2f}."
)
if phase_name == "accuracy":
return (
f"Accuracy expert {direction} reward; solvability={signals['solvability']:.2f}, "
f"format={signals['format_compliance']:.2f}."
)
return (
f"Challenge expert {direction} reward; difficulty={signals['difficulty_score']:.2f}, "
f"novelty={signals['novelty']:.2f}, format={signals['format_compliance']:.2f}."
)
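
if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the training pipeline: the metric
    # values below are invented purely to illustrate the call shape that the
    # math_environment_curriculum caller is expected to use.
    panel = SimulatedExpertPanel()
    demo = panel.apply_expert_preferences(
        base_reward=0.62,
        question_metrics={
            "clarity": 0.8,
            "solvability_score": 0.7,
            "difficulty_score": 0.4,
            "novelty_combined": 0.3,
        },
        solution_metrics={"format_compliance": 1.0},
        iteration=2,  # falls inside the "pedagogy" phase
    )
    # raw_modifier works out to +0.15 here, so it clips to MAX_MODIFIER (+0.08)
    # and adjusted_reward lands near 0.70.
    print(demo["phase"], demo["reward_modifier"], demo["adjusted_reward"])
    print(demo["feedback"])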