File size: 8,212 Bytes
901a0ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0b140b
 
 
 
 
 
962ad43
 
 
 
 
 
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901a0ed
 
 
 
 
 
 
b0b140b
 
 
 
901a0ed
 
 
 
 
 
 
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901a0ed
 
 
 
 
 
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Arena β€” run a compiled OptCoder submission on a landscape and score it.

Two entry points:

1. **`run_arena(opt, landscape, seeds, steps)`** β€” Phase-D full evaluation.
   Run the committed optimizer for ``steps`` iterations from each seed's fresh
   random init, collect per-seed trajectories, and return an ``ArenaResult``.
   Used at `commit` time to produce the terminal reward.

2. **`auto_test_draft(opt, landscape, seed, steps)`** β€” lightweight per-draft
   test. Single fixed seed, 20 steps by default. Result is a compact summary
   surfaced to the LLM in the next observation so it can decide whether to
   refine or commit.

Returned summary from ``auto_test_draft`` has these fields (consumed by the
LLM via the observation, and by the demo UI):

    converged    True if final_f < 10% of initial_f (a real convergence
                 signal, not just "code didn't raise").
    diverged     True if code raised a SandboxError (NaN, shape mismatch,
                 timeout, Python exception) OR if final_f blew up to an
                 astronomical value while the optimizer returned finite arrays
                 (a.k.a. "silent divergence" β€” e.g. LR too high, f grows
                 geometrically but no step() call errors).
    error        The SandboxError message if one was raised, else None.
    initial_f    f(x0) β€” the value before any step was taken.
    final_f      f(x_N) β€” the value at step N. This is the single most
                 important diagnostic; an agent looking at final_f=5e13 knows
                 its optimizer exploded even though the code compiled.
    min_f        min over all visited points (catches transient improvement
                 that was then lost).
    step_of_min  index (0..N-1) of the step at which min_f was achieved.

A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
committed β€” the ``compute_optcoder_reward`` pipeline treats exploded final_f
the same as a hard crash.
"""

from dataclasses import dataclass

import numpy as np

try:
    from .landscapes import Landscape
    from .sandbox import CompiledOptimizer, SandboxError
except ImportError:  # flat layout (HF Space container)
    from landscapes import Landscape                       # type: ignore
    from sandbox import CompiledOptimizer, SandboxError    # type: ignore


@dataclass
class ArenaResult:
    """Per-seed outcomes of one full arena run of a compiled optimizer."""

    initial_values: list[float]   # per-seed f(x_0)
    final_values: list[float]     # per-seed f(x_N); NaN if crashed
    crashed: list[bool]           # per-seed crash flags
    trajectories: list[list[dict]]  # per-seed trajectories (may be empty)

    @property
    def mean_progress(self) -> float:
        """Mean descent f_initial - f_final over all seeds.

        Positive = optimizer descended; 0 = stayed put; negative = uphill.
        A crashed seed, or one whose final value is non-finite, contributes
        0 progress — the conservative choice.
        """
        deltas = [
            0.0 if bad or not np.isfinite(f_end) else f_start - f_end
            for f_start, f_end, bad in zip(self.initial_values,
                                           self.final_values, self.crashed)
        ]
        return float(np.mean(deltas)) if deltas else 0.0

    @property
    def mean_initial_scale(self) -> float:
        """Mean |initial f| over finite seeds; 1.0 when none are finite.

        Serves as a denominator floor for reward computation when the
        baseline makes near-zero progress (rare, e.g. on plateaus).
        """
        magnitudes = [abs(v) for v in self.initial_values if np.isfinite(v)]
        if not magnitudes:
            return 1.0
        return float(np.mean(magnitudes))

    @property
    def crash_fraction(self) -> float:
        """Fraction of seeds that crashed; 0.0 when there are no seeds."""
        if not self.crashed:
            return 0.0
        return float(np.mean(self.crashed))

    @property
    def robustness(self) -> float:
        """Cross-seed consistency: 1 - std/|mean|, clamped to [0, 1].

        Computed over finite final values only; needs at least two of them.
        High = the optimizer lands at similar values regardless of seed.
        """
        finite = [v for v in self.final_values if np.isfinite(v)]
        if len(finite) < 2:
            return 0.0
        center = np.mean(finite)
        spread = np.std(finite)
        if abs(center) < 1e-9:
            # Near-zero mean makes the ratio ill-conditioned; fall back to
            # judging the spread directly.
            return 1.0 if spread < 1e-6 else 0.0
        return float(np.clip(1.0 - spread / (abs(center) + 1e-9), 0.0, 1.0))


def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
              seeds: list[int], steps: int = 200,
              init_scale: float = 0.5) -> ArenaResult:
    """Run the compiled optimizer from a fresh random init for every seed.

    Each seed draws x_0 ~ N(0, init_scale) in ``ls.dim`` dimensions, then
    iterates ``x <- optimizer.step(x, f(x), grad(x))`` for ``steps`` rounds
    while recording the trajectory. A ``SandboxError`` marks that seed as
    crashed (final value NaN) without aborting the remaining seeds.

    Deliberately independent of ``ls.f_min``: per-seed progress is
    ``f_initial - f_final``, which is observable whether or not the global
    minimum is known.
    """
    initial_values: list[float] = []
    final_values: list[float] = []
    crash_flags: list[bool] = []
    trajectories: list[list[dict]] = []

    for seed in seeds:
        rng = np.random.default_rng(seed)
        point = rng.normal(0.0, init_scale, size=ls.dim)
        initial_values.append(float(ls.f(point)))

        history: list[dict] = []
        ok = True
        try:
            for step_idx in range(steps):
                value = float(ls.f(point))
                gradient = np.asarray(ls.grad(point), dtype=float)
                # State is logged *before* the step, so history[t] is the
                # point the optimizer saw at iteration t.
                history.append({"t": step_idx, "x": point.tolist(), "f": value})
                point = optimizer.step(point, value, gradient)
        except SandboxError:
            ok = False

        final_values.append(float(ls.f(point)) if ok else float("nan"))
        crash_flags.append(not ok)
        trajectories.append(history)

    return ArenaResult(
        initial_values=initial_values,
        final_values=final_values,
        crashed=crash_flags,
        trajectories=trajectories,
    )


#: final_f above this (absolute value) is treated as silent divergence even
#: if every step() call returned finite values. Picked so genuinely bad
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
#: run, but runaway growth (f around 1e10+) gets flagged.
DIVERGENCE_F_THRESHOLD = 1e10


def auto_test_draft(optimizer: "CompiledOptimizer", ls: "Landscape",
                    seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
    """Single-seed quick test used at draft() time.

    Runs the optimizer for ``steps`` steps from a fixed seed, returning a
    dict ``{"summary": ..., "detail": ...}`` — a compact summary (for the
    LLM) plus full per-step detail (for ``inspect`` follow-ups).

    Silent-divergence detection: if the code raises, we flag diverged=True as
    usual. But we ALSO flag diverged when the optimizer returned finite arrays
    yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` — the common
    "LR too high, f grows geometrically, no errors" case.

    A diverged run still reports ``final_f`` / ``min_f`` / ``step_of_min``
    from whatever steps completed, so the caller can see *how far* the run
    exploded; they are ``None`` only when not a single step finished.
    """
    rng = np.random.default_rng(seed)
    x = rng.normal(0.0, init_scale, size=ls.dim)
    x0 = x.copy()
    initial_f = float(ls.f(x0))  # computed once; reused by both summary branches
    detail: list[dict] = []
    diverged = False
    err: str | None = None
    try:
        for t in range(steps):
            fv = float(ls.f(x))
            g = np.asarray(ls.grad(x), dtype=float)
            gn = float(np.linalg.norm(g))
            prev_x = x.copy()
            x = optimizer.step(x, fv, g)
            update_norm = float(np.linalg.norm(x - prev_x))
            # Effective step size: displacement per unit of gradient norm
            # (epsilon guards against a zero gradient).
            step_size = update_norm / (gn + 1e-12)
            detail.append({
                "t": t, "x": x.tolist(), "f": float(ls.f(x)),
                "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
            })
            # Silent-divergence guard — f exploded even though step() didn't raise
            if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
                diverged = True
                err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
                        f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
                break
    except SandboxError as e:
        diverged = True
        err = str(e)

    fs = [d["f"] for d in detail]
    if diverged or not fs:
        # Bug fix: final_f/min_f/step_of_min used to be None on every
        # diverged run, hiding *how badly* it exploded even though detail
        # held the numbers. Report completed steps; None only when none ran.
        summary = {
            "converged": False, "diverged": True, "error": err,
            "final_f": fs[-1] if fs else None, "initial_f": initial_f,
            "step_of_min": int(np.argmin(fs)) if fs else None,
            "min_f": min(fs) if fs else None,
        }
    else:
        summary = {
            # Converged = final value fell below 10% of the starting value.
            "converged": bool(fs[-1] < 0.1 * initial_f),
            "diverged": False, "error": None,
            "final_f": fs[-1], "initial_f": initial_f,
            "step_of_min": int(np.argmin(fs)), "min_f": min(fs),
        }
    return {"summary": summary, "detail": detail}