"""Reward computation for OptCoder and LandscapeForge.

Matches §9 of LANDSCAPEFORGE_DESIGN.md (v0.2).
"""

from __future__ import annotations

from dataclasses import dataclass

import numpy as np

try:
    from .arena import ArenaResult
except ImportError:  # flat layout (HF Space container)
    from arena import ArenaResult  # type: ignore


# Default weights (§9.1)
W_REGRET = 1.0
W_CONV = 0.3
W_ROBUST = 0.3
W_NOVELTY = 0.1
W_BUDGET = 0.05
W_EVAL_FAIL = 0.5
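
# Terminal reward is the weighted sum formed in compute_optcoder_reward below:
#   r_total = W_REGRET*r_regret + W_CONV*r_conv + W_ROBUST*r_robust
#             + W_NOVELTY*r_novelty - W_BUDGET*r_budget - W_EVAL_FAIL*r_eval_fail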

NOVELTY_GATE = 0.5    # novelty only applied when r_regret > this


@dataclass
class OptCoderReward:
    r_total: float
    breakdown: dict[str, float]


def compute_optcoder_reward(
    arena: ArenaResult,
    adam_arena: ArenaResult,
    actions_used_cost: int,     # sum of per-action costs, not count
    budget_total: int,
    novelty_score: float,       # AST edit distance / len, clamped to [0, 1]
    convergence_step: int | None,
    arena_steps: int,
) -> OptCoderReward:
    """Compute OptCoder's terminal reward (no `f_min` dependency).

    r_regret is driven by Adam-relative descent:
        my_progress   = mean(f_initial - f_final) across seeds for the draft
        adam_progress = mean(f_initial - f_final) across seeds for Adam
        r_regret = clamp(my_progress / max(adam_progress, floor) - 1, -1, +1)

    Interpretation:
        r_regret = +1   → descended ≥ 2× as far as Adam
        r_regret =  0   → matched Adam's descent
        r_regret = -1   → descended ≤ 0 while Adam descended normally
    """
    my_progress = arena.mean_progress
    adam_progress = adam_arena.mean_progress
    # Denominator floor: if Adam barely descended (e.g. plateau landscape),
    # use ~1% of initial |f| to avoid a tiny denominator exploding the ratio.
    denom_floor = 0.01 * adam_arena.mean_initial_scale + 1e-6
    denom = max(adam_progress, denom_floor)

    r_regret_raw = my_progress / denom - 1.0
    r_regret = float(np.clip(r_regret_raw, -1.0, 1.0))

    # Convergence speed: scales linearly with how early the run hit 1% of
    # initial f; 0 if it never converged within arena_steps.
    if convergence_step is None or convergence_step >= arena_steps:
        r_conv = 0.0
    else:
        r_conv = float(np.clip(1.0 - convergence_step / arena_steps, 0.0, 1.0))

    r_robust = arena.robustness

    # Novelty gated on regret performance
    r_novelty = float(novelty_score) if r_regret > NOVELTY_GATE else 0.0

    r_budget = float(np.clip(actions_used_cost / max(budget_total, 1), 0.0, 1.0))
    r_eval_fail = arena.crash_fraction

    total = (
        W_REGRET * r_regret
        + W_CONV * r_conv
        + W_ROBUST * r_robust
        + W_NOVELTY * r_novelty
        - W_BUDGET * r_budget
        - W_EVAL_FAIL * r_eval_fail
    )

    return OptCoderReward(
        r_total=float(total),
        breakdown={
            "r_regret": r_regret,
            "r_convergence": r_conv,
            "r_robustness": r_robust,
            "r_novelty": r_novelty,
            "r_budget": r_budget,
            "r_eval_failures": r_eval_fail,
            "my_progress": float(my_progress),
            "adam_progress": float(adam_progress),
            "speedup_vs_adam": float(my_progress / denom),
        },
    )
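

# Worked example of the regret math (illustrative numbers only): with
# my_progress = 4.0, adam_progress = 2.0 and mean_initial_scale = 10.0, the
# denominator floor is 0.01 * 10.0 + 1e-6 ≈ 0.1, so denom = 2.0,
# speedup_vs_adam = 2.0 and r_regret = clip(2.0 - 1.0, -1, 1) = +1.0
# (the draft descended twice as far as Adam).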


def ast_novelty_score(committed_source: str, reference_sources: list[str]) -> float:
    """Coarse AST-diff score vs a set of reference optimizers.

    Returns min over references of (edit distance / len of committed), clamped
    to [0, 1]. Near-zero means heavy copy. For v1 we use a simple char-level
    Levenshtein-ish ratio; AST diffing is deferred.
    """
    if not committed_source:
        return 0.0
    best = 1.0
    for ref in reference_sources:
        ratio = _diff_ratio(committed_source, ref)
        if ratio < best:
            best = ratio
    return float(np.clip(best, 0.0, 1.0))


def _diff_ratio(a: str, b: str) -> float:
    """difflib-based ratio: 1 - similarity. Cheap, roughly AST-order-insensitive."""
    import difflib
    sim = difflib.SequenceMatcher(None, a, b).ratio()
    return 1.0 - sim
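

# Behaviour sketch (follows directly from the two functions above):
#   ast_novelty_score(src, [src])     → 0.0  (identical to a reference: zero novelty)
#   ast_novelty_score("abc", ["xyz"]) → 1.0  (no shared characters: maximal novelty)
#   ast_novelty_score(src, [])        → 1.0  (no references to compare against)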


# ---------- Stepwise FEEDBACK (not reward) ----------
# These signals are exposed to the LLM through the observation so it can
# course-correct mid-episode. They are NOT summed into the training reward —
# terminal arena reward is the only GRPO signal, to preserve robustness.

COMPILE_PENALTY_SIGNAL = -0.1
PHI_SCALE = 10.0    # normalizer for best_draft_f to keep potential in ~[-1, 0]


def _draft_potential(draft_history: list[dict]) -> float:
    """Potential: higher = better. No drafts yet → -1.0 (worst) so that the
    first valid draft always emits a positive phi_delta proportional to its
    quality. Clamped to [-1, 0].
    """
    finals = [
        d["summary"]["final_f"]
        for d in draft_history
        if d.get("summary") and d["summary"].get("final_f") is not None
    ]
    if not finals:
        return -1.0   # no valid draft yet → worst-case potential
    best = min(finals)
    normed = min(max(best, 0.0), PHI_SCALE) / PHI_SCALE
    return -normed    # lower best_f → higher potential
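

# Worked example: an empty history gives phi = -1.0; a single valid draft with
# final_f = 2.5 gives normed = 2.5 / PHI_SCALE = 0.25, hence phi = -0.25 and a
# first-draft phi_delta of -0.25 - (-1.0) = +0.75.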


def compute_step_reward(prev_draft_history: list[dict],
                        new_draft_history: list[dict],
                        action_kind: str,
                        action_result: dict) -> dict:
    """Stepwise FEEDBACK signals for one REPL turn.

    Despite the name (kept for API compatibility), this does NOT produce
    training reward. The returned dict is structured for surfacing to the
    LLM's next prompt as part of `last_action_result.feedback`:

    - `phi_delta`: improvement in best-draft-so-far. Positive = the latest
        action moved the best known draft closer to the minimum.
    - `compile_penalty`: -0.1 marker indicating the last draft failed to
        compile. Helps the agent notice structural bugs immediately.

    Caller should NOT add the numeric values to the training reward.
    """
    breakdown: dict[str, float] = {}

    phi_prev = _draft_potential(prev_draft_history)
    phi_new = _draft_potential(new_draft_history)
    phi_delta = phi_new - phi_prev
    if abs(phi_delta) > 1e-9:
        breakdown["phi_delta"] = float(phi_delta)

    if action_kind == "draft" and action_result.get("compile_error"):
        breakdown["compile_penalty"] = COMPILE_PENALTY_SIGNAL

    # r_step retained for backwards compatibility but caller must ignore it
    # for training purposes (see docstring).
    r_step = float(sum(breakdown.values()))
    return {"r_step": r_step, "breakdown": breakdown}