| """Reward computation for OptCoder and LandscapeForge. | |
| Matches Β§9 of LANDSCAPEFORGE_DESIGN.md (v0.2). | |
| """ | |
from __future__ import annotations

from dataclasses import dataclass

import numpy as np

try:
    from .arena import ArenaResult
except ImportError:  # flat layout (HF Space container)
    from arena import ArenaResult  # type: ignore

# Default weights (§9.1)
W_REGRET = 1.0
W_CONV = 0.3
W_ROBUST = 0.3
W_NOVELTY = 0.1
W_BUDGET = 0.05
W_EVAL_FAIL = 0.5

NOVELTY_GATE = 0.5  # novelty only applied when r_regret > this

@dataclass
class OptCoderReward:
    r_total: float
    breakdown: dict[str, float]

def compute_optcoder_reward(
    arena: ArenaResult,
    adam_arena: ArenaResult,
    actions_used_cost: int,  # sum of per-action costs, not count
    budget_total: int,
    novelty_score: float,  # AST edit distance / len, clamped to [0, 1]
    convergence_step: int | None,
    arena_steps: int,
) -> OptCoderReward:
| """Compute OptCoder's terminal reward (no `f_min` dependency). | |
| r_regret is driven by Adam-relative descent: | |
| my_progress = mean(f_initial - f_final) across seeds for the draft | |
| adam_progress = same for Adam | |
| r_regret = clamp(my_progress / max(adam_progress, floor) - 1, -1, +1) | |
| Interpretation: | |
| r_regret = +1 β descended β₯ 2Γ as far as Adam | |
| r_regret = 0 β matched Adam's descent | |
| r_regret = -1 β descended β€ 0 while Adam descended normally | |
| """ | |
    my_progress = arena.mean_progress
    adam_progress = adam_arena.mean_progress

    # Denominator floor: if Adam barely descended (e.g. plateau landscape),
    # use ~1% of initial |f| to avoid a tiny denominator exploding the ratio.
    denom_floor = 0.01 * adam_arena.mean_initial_scale + 1e-6
    denom = max(adam_progress, denom_floor)
    r_regret_raw = my_progress / denom - 1.0
    r_regret = float(np.clip(r_regret_raw, -1.0, 1.0))

    # Convergence speed: 1.0 if the 1%-of-initial-f threshold is hit immediately,
    # decaying linearly to 0.0 if it is only hit at (or after) the last arena step.
    if convergence_step is None or convergence_step >= arena_steps:
        r_conv = 0.0
    else:
        r_conv = float(np.clip(1.0 - convergence_step / arena_steps, 0.0, 1.0))

    r_robust = arena.robustness

    # Novelty gated on regret performance
    r_novelty = float(novelty_score) if r_regret > NOVELTY_GATE else 0.0

    r_budget = float(np.clip(actions_used_cost / max(budget_total, 1), 0.0, 1.0))
    r_eval_fail = arena.crash_fraction

    total = (
        W_REGRET * r_regret
        + W_CONV * r_conv
        + W_ROBUST * r_robust
        + W_NOVELTY * r_novelty
        - W_BUDGET * r_budget
        - W_EVAL_FAIL * r_eval_fail
    )
    return OptCoderReward(
        r_total=float(total),
        breakdown={
            "r_regret": r_regret,
            "r_convergence": r_conv,
            "r_robustness": r_robust,
            "r_novelty": r_novelty,
            "r_budget": r_budget,
            "r_eval_failures": r_eval_fail,
            "my_progress": float(my_progress),
            "adam_progress": float(adam_progress),
            "speedup_vs_adam": float(my_progress / denom),
        },
    )

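# Illustrative sketch (not part of the reward path): how the terminal reward is
# assembled end to end. ArenaResult is mocked with SimpleNamespace here; the
# attribute names are an assumption taken from the fields accessed above
# (mean_progress, mean_initial_scale, robustness, crash_fraction), not the real
# class from arena.py.
def _example_terminal_reward() -> None:
    from types import SimpleNamespace

    draft = SimpleNamespace(mean_progress=4.2, mean_initial_scale=10.0,
                            robustness=0.8, crash_fraction=0.0)
    adam = SimpleNamespace(mean_progress=3.0, mean_initial_scale=10.0,
                           robustness=0.9, crash_fraction=0.0)
    reward = compute_optcoder_reward(
        arena=draft,
        adam_arena=adam,
        actions_used_cost=12,
        budget_total=40,
        novelty_score=0.7,
        convergence_step=25,
        arena_steps=100,
    )
    # r_regret = clip(4.2 / 3.0 - 1, -1, 1) = 0.4; novelty stays zero because
    # 0.4 does not exceed NOVELTY_GATE.
    print(reward.r_total, reward.breakdown)
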
def ast_novelty_score(committed_source: str, reference_sources: list[str]) -> float:
    """Coarse AST-diff score vs a set of reference optimizers.

    Returns min over references of (edit distance / len of committed), clamped
    to [0, 1]. Near-zero means heavy copy. For v1 we use a simple char-level
    Levenshtein-ish ratio; AST diffing is deferred.
    """
    if not committed_source:
        return 0.0
    best = 1.0
    for ref in reference_sources:
        ratio = _diff_ratio(committed_source, ref)
        if ratio < best:
            best = ratio
    return float(np.clip(best, 0.0, 1.0))

def _diff_ratio(a: str, b: str) -> float:
    """difflib-based ratio: 1 - similarity. A cheap char-level stand-in for a real AST diff."""
    import difflib

    sim = difflib.SequenceMatcher(None, a, b).ratio()
    return 1.0 - sim

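# Illustrative sketch: novelty of a near-copy vs. a genuine rewrite. The
# reference sources below are tiny made-up stand-ins, not the real reference
# optimizer bank.
def _example_novelty() -> None:
    adam_src = "def step(p, g, m, v): return p - 0.001 * m"
    near_copy = "def step(p, g, m, v):  return p - 0.001 * m"  # whitespace tweak only
    rewrite = "def step(params, grads, state): return params - 0.1 * grads"
    print(ast_novelty_score(near_copy, [adam_src]))  # close to 0.0 → heavy copy
    print(ast_novelty_score(rewrite, [adam_src]))    # noticeably higher → more novel
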
# ---------- Stepwise FEEDBACK (not reward) ----------
# These signals are exposed to the LLM through the observation so it can
# course-correct mid-episode. They are NOT summed into the training reward;
# the terminal arena reward is the only GRPO signal, to preserve robustness.

COMPILE_PENALTY_SIGNAL = -0.1
PHI_SCALE = 10.0  # normalizer for best_draft_f to keep potential in ~[-1, 0]

def _draft_potential(draft_history: list[dict]) -> float:
    """Potential: higher = better. No drafts yet → -1.0 (worst) so that the
    first valid draft always emits a positive phi_delta proportional to its
    quality. Clamped to [-1, 0].
    """
    finals = [
        d["summary"]["final_f"]
        for d in draft_history
        if d.get("summary") and d["summary"].get("final_f") is not None
    ]
    if not finals:
        return -1.0  # no valid draft yet → worst-case potential
    best = min(finals)
    normed = min(max(best, 0.0), PHI_SCALE) / PHI_SCALE
    return -normed  # lower best_f → higher potential

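# Illustrative values for the potential function; the `final_f` numbers are
# made up and assume the default PHI_SCALE = 10.0.
def _example_potential() -> None:
    first = [{"summary": {"final_f": 6.0}}]
    better = first + [{"summary": {"final_f": 2.0}}]
    print(_draft_potential([]))      # -1.0 (no valid draft yet)
    print(_draft_potential(first))   # -0.6 (6.0 / PHI_SCALE)
    print(_draft_potential(better))  # -0.2 (best final_f improved to 2.0)
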
def compute_step_reward(prev_draft_history: list[dict],
                        new_draft_history: list[dict],
                        action_kind: str,
                        action_result: dict) -> dict:
    """Stepwise FEEDBACK signals for one REPL turn.

    Despite the name (kept for API compatibility), this does NOT produce
    training reward. The returned dict is structured for surfacing to the
    LLM's next prompt as part of `last_action_result.feedback`:

    - `phi_delta`: improvement in best-draft-so-far. Positive = the latest
      action moved the best known draft closer to the minimum.
    - `compile_penalty`: -0.1 marker indicating the last draft failed to
      compile. Helps the agent notice structural bugs immediately.

    Caller should NOT add the numeric values to the training reward.
    """
    breakdown: dict[str, float] = {}

    phi_prev = _draft_potential(prev_draft_history)
    phi_new = _draft_potential(new_draft_history)
    phi_delta = phi_new - phi_prev
    if abs(phi_delta) > 1e-9:
        breakdown["phi_delta"] = float(phi_delta)

    if action_kind == "draft" and action_result.get("compile_error"):
        breakdown["compile_penalty"] = COMPILE_PENALTY_SIGNAL

    # r_step retained for backwards compatibility but caller must ignore it
    # for training purposes (see docstring).
    r_step = float(sum(breakdown.values()))
    return {"r_step": r_step, "breakdown": breakdown}
