"""
Simulated expert panel with shifting preferences across curriculum phases.

Reward-shaping design notes
---------------------------

Historically this panel applied a **multiplicative** shaping step::

    adjusted = clip01(base * (1 + modifier)),  modifier in [-0.3, +0.3]

Analysis of 20 PPO iterations made two problems obvious:

1. Saturation. Any base >= 0.77 was clipped to exactly 1.0 under the
   maximum boost, and a large fraction of self-play rollouts land in
   that zone every iteration. After the rollout buffer whitens
   advantages, a cluster of identical 1.0s flattens the policy
   gradient; that is the "policy_loss ~ -0.004 across every
   iteration" signature. Meanwhile the rare non-saturated outlier
   produces a huge standardized advantage -> KL spikes -> early stop.
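   (Worked example: with the old maximum boost of +0.3,
   clip01(0.77 * 1.3) = clip01(1.001) = 1.0, so every base in
   [0.77, 1.0] collapsed to the same maximum reward.)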
2. PRM triple-counting. The panel used ``correctness`` and
   ``consensus`` weights, and the caller wired both to ``PRM_mean``.
   Combined with the PRM terms inside ``sol`` itself, a single frozen
   PRM's opinion drove ~75% of the variance in ``combined``. The
   policy can game that by finding text the PRM likes without the
   answer being correct.

The replacement here is:

* **Additive** shaping with a tight bound (|modifier| <= 0.08 by
  default). No multiplication, no clip-to-1. ``base`` stays in
  [0, 1] as computed by the environment, and shaping only nudges it
  a little; GAE + advantage normalization handle scale downstream.
  (A worked example follows this list.)
* The panel no longer consumes the PRM-correlated signals
  (``correctness``, ``consensus_score``). Those already live inside
  ``sol``. What the panel *does* add is curriculum-phase taste:
  clarity, solvability, difficulty match, novelty, and format
  compliance.
* A harder, one-sided format penalty: badly-formatted outputs are
  penalized more than well-formatted ones are rewarded. Solutions
  that don't even parse should not win ties over ones that do.
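
Worked example (hypothetical signal values; signals are centered at
0.5 before weighting, see ``apply_expert_preferences``): in the
``accuracy`` phase (clarity weight 0.10, solvability weight 0.20,
format-penalty weight 0.70), a question with clarity 0.9 and
solvability 0.8 whose solution is fully format-compliant gives::

    raw_modifier = 0.10 * (0.9 - 0.5) + 0.20 * (0.8 - 0.5) - 0.70 * 0.0
                 = 0.10

which the +/-0.08 clamp reduces to 0.08, so a base reward of 0.6
becomes 0.68.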

Nothing about the public API changes: the returned dict still has
``adjusted_reward``, ``reward_modifier``, ``raw_modifier``,
``phase``/``description``, ``signals``, and ``feedback``.
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Optional

# Tight additive bound. With base in [0, 1] this keeps the final reward
# inside roughly [-0.08, 1.08]; the environment re-clips to [0, 1].
# Lower than the old 0.3 on purpose: shaping is a flavor term, not the
# main signal.
MAX_MODIFIER = 0.08


@dataclass(frozen=True)
class ExpertPhase:
    name: str
    start_iteration: int
    end_iteration: Optional[int]
    clarity_weight: float
    solvability_weight: float
    difficulty_weight: float
    novelty_weight: float
    format_penalty_weight: float
    description: str

    def active_for(self, iteration: int) -> bool:
        if iteration < self.start_iteration:
            return False
        if self.end_iteration is None:
            return True
        return iteration <= self.end_iteration
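

# Minimal sanity check of the phase-window logic (hypothetical throwaway
# phase; the weight values are irrelevant here):
#
#     _demo = ExpertPhase("demo", 4, 6, 0.0, 0.0, 0.0, 0.0, 0.0, "")
#     assert [_demo.active_for(i) for i in (3, 4, 6, 7)] == [False, True, True, False]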


class SimulatedExpertPanel:
    """Applies phase-specific bounded **additive** reward shaping.

    No more multiplication, no more clip-to-1, and crucially no more
    ``correctness``/``consensus`` knobs (which used to double-count
    PRM_mean on top of ``sol``). The panel now only shapes question
    quality and format; the correctness signal lives in ``sol`` alone.
    """

    def __init__(self) -> None:
        self._phases: List[ExpertPhase] = [
            ExpertPhase(
                name="pedagogy",
                start_iteration=0,
                end_iteration=3,
                clarity_weight=0.30,
                solvability_weight=0.25,
                difficulty_weight=-0.10,
                novelty_weight=0.00,
                format_penalty_weight=0.40,
                description="Prioritize clear, learnable, and solvable foundation tasks.",
            ),
            ExpertPhase(
                name="accuracy",
                start_iteration=4,
                end_iteration=6,
                clarity_weight=0.10,
                solvability_weight=0.20,
                difficulty_weight=0.00,
                novelty_weight=0.00,
                format_penalty_weight=0.70,
                description="Prioritize arithmetic correctness and agreement stability.",
            ),
            ExpertPhase(
                name="challenge",
                start_iteration=7,
                end_iteration=None,
                clarity_weight=0.10,
                solvability_weight=0.10,
                difficulty_weight=0.30,
                novelty_weight=0.20,
                format_penalty_weight=0.30,
                description="Prioritize challenging, novel, and diverse problems.",
            ),
        ]

    def get_current_expert(self, iteration: int) -> ExpertPhase:
        for phase in self._phases:
            if phase.active_for(iteration):
                return phase
        return self._phases[-1]

    def apply_expert_preferences(
        self,
        base_reward: float,
        question_metrics: Dict[str, object],
        solution_metrics: Dict[str, object],
        iteration: int,
    ) -> Dict[str, object]:
        phase = self.get_current_expert(iteration)

        clarity = float(question_metrics.get("clarity", 0.0))
        solvability = float(question_metrics.get("solvability_score", 0.0))
        difficulty = float(question_metrics.get("difficulty_score", 0.0))
        novelty = float(question_metrics.get("novelty_combined", 0.0))
        format_compliance = float(solution_metrics.get("format_compliance", 0.0))
        format_penalty = 1.0 - format_compliance

        # Centered versions keep the additive shaping close to zero when
        # quality signals are average; only genuinely good (>0.5) or
        # genuinely bad (<0.5) questions move the needle. Without this,
        # every single rollout got a +0.15 bump just for producing a
        # non-empty string.
        clarity_c = clarity - 0.5
        solvability_c = solvability - 0.5
        difficulty_c = difficulty - 0.5
        novelty_c = novelty - 0.5

        raw_modifier = (
            phase.clarity_weight * clarity_c
            + phase.solvability_weight * solvability_c
            + phase.difficulty_weight * difficulty_c
            + phase.novelty_weight * novelty_c
            - phase.format_penalty_weight * format_penalty
        )
        modifier = max(-MAX_MODIFIER, min(MAX_MODIFIER, raw_modifier))

        # Additive, no multiplication. We leave the final [0, 1] clip to
        # the caller (math_environment_curriculum) so it can combine the
        # shaping with its own format-floor rule.
        adjusted_reward = float(base_reward) + modifier

        signals = {
            "clarity": clarity,
            "solvability": solvability,
            "difficulty_score": difficulty,
            "novelty": novelty,
            "format_compliance": format_compliance,
        }
        return {
            "phase": phase.name,
            "description": phase.description,
            "phase_start_iteration": phase.start_iteration,
            "phase_end_iteration": phase.end_iteration,
            "base_reward": float(base_reward),
            "adjusted_reward": adjusted_reward,
            "reward_modifier": modifier,
            "raw_modifier": raw_modifier,
            "signals": signals,
            "feedback": self.get_expert_feedback(
                phase_name=phase.name,
                reward_modifier=modifier,
                signals=signals,
            ),
        }

    def get_expert_feedback(
        self,
        phase_name: str,
        reward_modifier: float,
        signals: Dict[str, float],
    ) -> str:
        direction = "boosted" if reward_modifier >= 0 else "penalized"
        if phase_name == "pedagogy":
            return (
                f"Pedagogy expert {direction} reward; clarity={signals['clarity']:.2f}, "
                f"solvability={signals['solvability']:.2f}, difficulty={signals['difficulty_score']:.2f}."
            )
        if phase_name == "accuracy":
            return (
                f"Accuracy expert {direction} reward; solvability={signals['solvability']:.2f}, "
                f"format={signals['format_compliance']:.2f}."
            )
        return (
            f"Challenge expert {direction} reward; difficulty={signals['difficulty_score']:.2f}, "
            f"novelty={signals['novelty']:.2f}, format={signals['format_compliance']:.2f}."
        )
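

# Minimal usage sketch (hypothetical metric values; the final [0, 1]
# re-clip below stands in for the caller's own rule, which this module
# deliberately leaves to math_environment_curriculum):
if __name__ == "__main__":
    panel = SimulatedExpertPanel()
    result = panel.apply_expert_preferences(
        base_reward=0.6,
        question_metrics={"clarity": 0.9, "solvability_score": 0.8},
        solution_metrics={"format_compliance": 1.0},
        iteration=5,  # lands in the "accuracy" phase (iterations 4-6)
    )
    # raw_modifier = 0.10*0.4 + 0.20*0.3 - 0.70*0.0 = 0.10 -> clamped to +0.08,
    # so adjusted_reward = 0.6 + 0.08 = 0.68.
    clipped = max(0.0, min(1.0, result["adjusted_reward"]))
    print(result["phase"], result["adjusted_reward"], clipped)
    print(result["feedback"])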