"""Reward computation for OptCoder and LandscapeForge.
Matches §9 of LANDSCAPEFORGE_DESIGN.md (v0.2).
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
try:
from .arena import ArenaResult
except ImportError: # flat layout (HF Space container)
from arena import ArenaResult # type: ignore
# Default reward weights (§9.1). Positive-weight terms are added, the budget
# and eval-failure terms are subtracted (see compute_optcoder_reward).
W_REGRET = 1.0  # Adam-relative descent — the primary signal
W_CONV = 0.3  # convergence-speed bonus
W_ROBUST = 0.3  # cross-seed robustness (arena.robustness)
W_NOVELTY = 0.1  # AST-novelty bonus, gated on regret (see NOVELTY_GATE)
W_BUDGET = 0.05  # penalty for action-budget consumption (subtracted)
W_EVAL_FAIL = 0.5  # penalty for crash fraction in the arena (subtracted)
NOVELTY_GATE = 0.5  # novelty only applied when r_regret > this
@dataclass
class OptCoderReward:
    """Terminal reward for one OptCoder episode."""
    # Scalar training signal: weighted sum of the reward components.
    r_total: float
    # Per-component values plus diagnostics (e.g. my_progress, adam_progress).
    breakdown: dict[str, float]
def compute_optcoder_reward(
    arena: ArenaResult,
    adam_arena: ArenaResult,
    actions_used_cost: int,  # sum of per-action costs, not count
    budget_total: int,
    novelty_score: float,  # AST edit distance / len, clamped to [0, 1]
    convergence_step: int | None,
    arena_steps: int,
) -> OptCoderReward:
    """Compute OptCoder's terminal reward (no `f_min` dependency).

    The regret term is Adam-relative descent:

        my_progress   = mean(f_initial - f_final) across seeds for the draft
        adam_progress = same quantity for Adam
        r_regret      = clamp(my_progress / max(adam_progress, floor) - 1, -1, +1)

    so +1 means the draft descended at least twice as far as Adam, 0 means it
    matched Adam, and -1 means it made no progress while Adam descended.
    Returns an :class:`OptCoderReward` with the weighted total and a breakdown
    of every component plus raw diagnostics.
    """
    draft_descent = arena.mean_progress
    baseline_descent = adam_arena.mean_progress

    # Floor the denominator: on plateau landscapes Adam barely moves, so use
    # ~1% of the initial |f| scale to stop a tiny denominator exploding the ratio.
    floor = 0.01 * adam_arena.mean_initial_scale + 1e-6
    denominator = max(baseline_descent, floor)
    speedup = draft_descent / denominator
    r_regret = float(np.clip(speedup - 1.0, -1.0, 1.0))

    # Convergence-speed bonus: fraction of the step budget remaining when the
    # draft first hit 1% of the initial f; zero if it never converged in time.
    converged_in_time = convergence_step is not None and convergence_step < arena_steps
    r_conv = (
        float(np.clip(1.0 - convergence_step / arena_steps, 0.0, 1.0))
        if converged_in_time
        else 0.0
    )

    r_robust = arena.robustness
    # Novelty is rewarded only once the draft clearly outperforms Adam.
    r_novelty = float(novelty_score) if r_regret > NOVELTY_GATE else 0.0
    r_budget = float(np.clip(actions_used_cost / max(budget_total, 1), 0.0, 1.0))
    r_eval_fail = arena.crash_fraction

    # Weighted terms in a fixed order; the last two are penalties.
    weighted_terms = [
        W_REGRET * r_regret,
        W_CONV * r_conv,
        W_ROBUST * r_robust,
        W_NOVELTY * r_novelty,
        -W_BUDGET * r_budget,
        -W_EVAL_FAIL * r_eval_fail,
    ]
    return OptCoderReward(
        r_total=float(sum(weighted_terms)),
        breakdown={
            "r_regret": r_regret,
            "r_convergence": r_conv,
            "r_robustness": r_robust,
            "r_novelty": r_novelty,
            "r_budget": r_budget,
            "r_eval_failures": r_eval_fail,
            "my_progress": float(draft_descent),
            "adam_progress": float(baseline_descent),
            "speedup_vs_adam": float(speedup),
        },
    )
def ast_novelty_score(committed_source: str, reference_sources: list[str]) -> float:
    """Coarse AST-diff novelty of *committed_source* against reference optimizers.

    Returns the minimum over references of (edit distance / length of the
    committed source), clamped to [0, 1]; near-zero indicates heavy copying.
    v1 approximates this with a char-level difflib ratio — real AST diffing
    is deferred. With no references the score defaults to the maximum (1.0).
    """
    if not committed_source:
        return 0.0
    distances = (_diff_ratio(committed_source, ref) for ref in reference_sources)
    return float(np.clip(min(distances, default=1.0), 0.0, 1.0))
def _diff_ratio(a: str, b: str) -> float:
"""difflib-based ratio: 1 - similarity. Cheap, roughly AST-order-insensitive."""
import difflib
sim = difflib.SequenceMatcher(None, a, b).ratio()
return 1.0 - sim
# ---------- Stepwise FEEDBACK (not reward) ----------
# These signals are exposed to the LLM through the observation so it can
# course-correct mid-episode. They are NOT summed into the training reward —
# terminal arena reward is the only GRPO signal, to preserve robustness.
COMPILE_PENALTY_SIGNAL = -0.1  # marker emitted when a draft fails to compile
PHI_SCALE = 10.0  # normalizer for best_draft_f to keep potential in ~[-1, 0]
def _draft_potential(draft_history: list[dict]) -> float:
    """Potential over the draft history (higher = better), clamped to [-1, 0].

    With no valid draft yet the potential is -1.0 (worst case), so the first
    valid draft always emits a positive phi_delta proportional to its quality.
    Only entries with a summary carrying a non-None ``final_f`` count.
    """
    best_final: float | None = None
    for entry in draft_history:
        summary = entry.get("summary")
        if not summary:
            continue
        final_f = summary.get("final_f")
        if final_f is None:
            continue
        if best_final is None or final_f < best_final:
            best_final = final_f
    if best_final is None:
        return -1.0  # no valid draft yet → worst-case potential
    # Clamp best f into [0, PHI_SCALE], normalize, negate: lower f → higher phi.
    return -(min(max(best_final, 0.0), PHI_SCALE) / PHI_SCALE)
def compute_step_reward(prev_draft_history: list[dict],
                        new_draft_history: list[dict],
                        action_kind: str,
                        action_result: dict) -> dict:
    """Stepwise FEEDBACK signals for one REPL turn.

    Despite the name (kept for API compatibility), this does NOT produce
    training reward. The returned dict is structured for surfacing to the
    LLM's next prompt as part of `last_action_result.feedback`:

    - `phi_delta`: change in the best-draft-so-far potential. Positive means
      the latest action moved the best known draft closer to the minimum.
    - `compile_penalty`: -0.1 marker indicating the last draft failed to
      compile, so the agent notices structural bugs immediately.

    Caller should NOT add the numeric values to the training reward.
    """
    signals: dict[str, float] = {}

    delta = _draft_potential(new_draft_history) - _draft_potential(prev_draft_history)
    if abs(delta) > 1e-9:
        signals["phi_delta"] = float(delta)

    if action_kind == "draft" and action_result.get("compile_error"):
        signals["compile_penalty"] = COMPILE_PENALTY_SIGNAL

    # r_step is retained for backwards compatibility only; callers must ignore
    # it for training purposes (see docstring).
    return {"r_step": float(sum(signals.values())), "breakdown": signals}